From 95de5094f5ac50b6f355f4e7dffcb6f34bd5dada Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Tue, 22 Sep 2020 18:24:29 +0800 Subject: firmware: imx: add dummy functions add dummy functions to avoid build failure when header files are included, but drivers are not built. Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- include/linux/firmware/imx/ipc.h | 13 +++++++++++++ include/linux/firmware/imx/sci.h | 27 +++++++++++++++++++++++++++ include/linux/firmware/imx/svc/misc.h | 19 +++++++++++++++++++ 3 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/imx/ipc.h b/include/linux/firmware/imx/ipc.h index 891057434858..0b4643571625 100644 --- a/include/linux/firmware/imx/ipc.h +++ b/include/linux/firmware/imx/ipc.h @@ -34,6 +34,7 @@ struct imx_sc_rpc_msg { uint8_t func; }; +#ifdef CONFIG_IMX_SCU /* * This is an function to send an RPC message over an IPC channel. * It is called by client-side SCFW API function shims. @@ -55,4 +56,16 @@ int imx_scu_call_rpc(struct imx_sc_ipc *ipc, void *msg, bool have_resp); * @return Returns an error code (0 = success, failed if < 0) */ int imx_scu_get_handle(struct imx_sc_ipc **ipc); +#else +static inline int imx_scu_call_rpc(struct imx_sc_ipc *ipc, void *msg, + bool have_resp) +{ + return -ENOTSUPP; +} + +static inline int imx_scu_get_handle(struct imx_sc_ipc **ipc) +{ + return -ENOTSUPP; +} +#endif #endif /* _SC_IPC_H */ diff --git a/include/linux/firmware/imx/sci.h b/include/linux/firmware/imx/sci.h index 22c76571a294..5cc63fe7e84d 100644 --- a/include/linux/firmware/imx/sci.h +++ b/include/linux/firmware/imx/sci.h @@ -16,9 +16,36 @@ #include #include +#if IS_ENABLED(CONFIG_IMX_SCU) int imx_scu_enable_general_irq_channel(struct device *dev); int imx_scu_irq_register_notifier(struct notifier_block *nb); int imx_scu_irq_unregister_notifier(struct notifier_block *nb); int imx_scu_irq_group_enable(u8 group, u32 mask, u8 enable); int imx_scu_soc_init(struct device *dev); +#else +static inline int imx_scu_soc_init(struct device *dev) +{ + return -ENOTSUPP; +} + +static inline int imx_scu_enable_general_irq_channel(struct device *dev) +{ + return -ENOTSUPP; +} + +static inline int imx_scu_irq_register_notifier(struct notifier_block *nb) +{ + return -ENOTSUPP; +} + +static inline int imx_scu_irq_unregister_notifier(struct notifier_block *nb) +{ + return -ENOTSUPP; +} + +static inline int imx_scu_irq_group_enable(u8 group, u32 mask, u8 enable) +{ + return -ENOTSUPP; +} +#endif #endif /* _SC_SCI_H */ diff --git a/include/linux/firmware/imx/svc/misc.h b/include/linux/firmware/imx/svc/misc.h index 031dd4d3c766..760db08a67fc 100644 --- a/include/linux/firmware/imx/svc/misc.h +++ b/include/linux/firmware/imx/svc/misc.h @@ -46,6 +46,7 @@ enum imx_misc_func { * Control Functions */ +#ifdef CONFIG_IMX_SCU int imx_sc_misc_set_control(struct imx_sc_ipc *ipc, u32 resource, u8 ctrl, u32 val); @@ -54,5 +55,23 @@ int imx_sc_misc_get_control(struct imx_sc_ipc *ipc, u32 resource, int imx_sc_pm_cpu_start(struct imx_sc_ipc *ipc, u32 resource, bool enable, u64 phys_addr); +#else +static inline int imx_sc_misc_set_control(struct imx_sc_ipc *ipc, + u32 resource, u8 ctrl, u32 val) +{ + return -ENOTSUPP; +} +static inline int imx_sc_misc_get_control(struct imx_sc_ipc *ipc, + u32 resource, u8 ctrl, u32 *val) +{ + return -ENOTSUPP; +} + +static inline int imx_sc_pm_cpu_start(struct imx_sc_ipc *ipc, u32 resource, + bool enable, u64 phys_addr) +{ + return -ENOTSUPP; +} +#endif #endif /* _SC_MISC_API_H */ -- cgit v1.2.3 From 
39613eaad3ceff320da344427a70c655e783475e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 28 Oct 2020 12:16:10 +0530 Subject: qcom-geni-se: remove has_opp_table has_opp_table isn't used anymore, remove it. Signed-off-by: Viresh Kumar Link: https://lore.kernel.org/r/08ec1ee1d4252a266956abb5f1e0e0026d753564.1603867487.git.viresh.kumar@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/qcom-geni-se.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h index f7bbea3f09ca..ec2ad4b0fe14 100644 --- a/include/linux/qcom-geni-se.h +++ b/include/linux/qcom-geni-se.h @@ -48,7 +48,6 @@ struct geni_icc_path { * @clk_perf_tbl: Table of clock frequency input to serial engine clock * @icc_paths: Array of ICC paths for SE * @opp_table: Pointer to the OPP table - * @has_opp_table: Specifies if the SE has an OPP table */ struct geni_se { void __iomem *base; @@ -59,7 +58,6 @@ struct geni_se { unsigned long *clk_perf_tbl; struct geni_icc_path icc_paths[3]; struct opp_table *opp_table; - bool has_opp_table; }; /* Common SE registers */ -- cgit v1.2.3 
From f1f37abbe6fc2b1242f78157db76e48dbf9518ee Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 19 Oct 2020 15:40:46 +0200 Subject: gpio: Retire the explicit gpio irqchip code Now that all gpiolib irqchip users have been converted over to use the irqchip template, we can finally retire the old code path and leave just one way in to the irqchip: set up the template when registering the gpio_chip. For a while we had two code paths for this, which was a bit confusing. This brings this work to a conclusion: there is now one way of doing this. Signed-off-by: Linus Walleij Reviewed-by: Andy Shevchenko Cc: Thierry Reding Link: https://lore.kernel.org/r/20201019134046.65101-1-linus.walleij@linaro.org --- Documentation/driver-api/gpio/driver.rst | 63 ++++++++----- drivers/gpio/TODO | 49 ---------- drivers/gpio/gpiolib.c | 153 ------------------------------- include/linux/gpio/driver.h | 71 -------------- 4 files changed, 42 insertions(+), 294 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst index 072a7455044e..65d708093b71 100644 --- a/Documentation/driver-api/gpio/driver.rst +++ b/Documentation/driver-api/gpio/driver.rst @@ -416,7 +416,8 @@ The preferred way to set up the helpers is to fill in the struct gpio_irq_chip inside struct gpio_chip before adding the gpio_chip. If you do this, the additional irq_chip will be set up by gpiolib at the same time as setting up the rest of the GPIO functionality. The following -is a typical example of a cascaded interrupt handler using gpio_irq_chip: +is a typical example of a chained cascaded interrupt handler using +the gpio_irq_chip: .. code-block:: c @@ -452,7 +453,46 @@ is a typical example of a cascaded interrupt handler using gpio_irq_chip: return devm_gpiochip_add_data(dev, &g->gc, g); -The helper support using hierarchical interrupt controllers as well. +The helper supports using threaded interrupts as well. Then you just request +the interrupt separately and go with it: + +.. 
code-block:: c + + /* Typical state container with dynamic irqchip */ + struct my_gpio { + struct gpio_chip gc; + struct irq_chip irq; + }; + + int irq; /* from platform etc */ + struct my_gpio *g; + struct gpio_irq_chip *girq; + + /* Set up the irqchip dynamically */ + g->irq.name = "my_gpio_irq"; + g->irq.irq_ack = my_gpio_ack_irq; + g->irq.irq_mask = my_gpio_mask_irq; + g->irq.irq_unmask = my_gpio_unmask_irq; + g->irq.irq_set_type = my_gpio_set_irq_type; + + ret = devm_request_threaded_irq(dev, irq, NULL, + irq_thread_fn, IRQF_ONESHOT, "my-chip", g); + if (ret < 0) + return ret; + + /* Get a pointer to the gpio_irq_chip */ + girq = &g->gc.irq; + girq->chip = &g->irq; + /* This will let us handle the parent IRQ in the driver */ + girq->parent_handler = NULL; + girq->num_parents = 0; + girq->parents = NULL; + girq->default_type = IRQ_TYPE_NONE; + girq->handler = handle_bad_irq; + + return devm_gpiochip_add_data(dev, &g->gc, g); + +The helper supports using hierarchical interrupt controllers as well. In this case the typical set-up will look like this: .. code-block:: c @@ -493,25 +533,6 @@ the parent hardware irq from a child (i.e. this gpio chip) hardware irq. As always it is good to look at examples in the kernel tree for advice on how to find the required pieces. -The old way of adding irqchips to gpiochips after registration is also still -available but we try to move away from this: - -- DEPRECATED: gpiochip_irqchip_add(): adds a chained cascaded irqchip to a - gpiochip. It will pass the struct gpio_chip* for the chip to all IRQ - callbacks, so the callbacks need to embed the gpio_chip in its state - container and obtain a pointer to the container using container_of(). - (See Documentation/driver-api/driver-model/design-patterns.rst) - -- gpiochip_irqchip_add_nested(): adds a nested cascaded irqchip to a gpiochip, - as discussed above regarding different types of cascaded irqchips. The - cascaded irq has to be handled by a threaded interrupt handler. - Apart from that it works exactly like the chained irqchip. - -- gpiochip_set_nested_irqchip(): sets up a nested cascaded irq handler for a - gpio_chip from a parent IRQ. As the parent IRQ has usually been - explicitly requested by the driver, this does very little more than - mark all the child IRQs as having the other IRQ as parent. - If there is a need to exclude certain GPIO lines from the IRQ domain handled by these helpers, we can set .irq.need_valid_mask of the gpiochip before devm_gpiochip_add_data() or gpiochip_add_data() is called. This allocates an diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO index e560e45e84f8..cd04e0b60159 100644 --- a/drivers/gpio/TODO +++ b/drivers/gpio/TODO @@ -129,58 +129,9 @@ GPIOLIB irqchip The GPIOLIB irqchip is a helper irqchip for "simple cases" that should try to cover any generic kind of irqchip cascaded from a GPIO. -- Convert all the GPIOLIB_IRQCHIP users to pass an irqchip template, - parent and flags before calling [devm_]gpiochip_add[_data](). - Currently we set up the irqchip after setting up the gpiochip - using gpiochip_irqchip_add() and gpiochip_set_[chained|nested]_irqchip(). 
- This is too complex, so convert all users over to just set up - the irqchip before registering the gpio_chip, typical example: - - /* Typical state container with dynamic irqchip */ - struct my_gpio { - struct gpio_chip gc; - struct irq_chip irq; - }; - - int irq; /* from platform etc */ - struct my_gpio *g; - struct gpio_irq_chip *girq; - - /* Set up the irqchip dynamically */ - g->irq.name = "my_gpio_irq"; - g->irq.irq_ack = my_gpio_ack_irq; - g->irq.irq_mask = my_gpio_mask_irq; - g->irq.irq_unmask = my_gpio_unmask_irq; - g->irq.irq_set_type = my_gpio_set_irq_type; - - /* Get a pointer to the gpio_irq_chip */ - girq = &g->gc.irq; - girq->chip = &g->irq; - girq->parent_handler = ftgpio_gpio_irq_handler; - girq->num_parents = 1; - girq->parents = devm_kcalloc(dev, 1, sizeof(*girq->parents), - GFP_KERNEL); - if (!girq->parents) - return -ENOMEM; - girq->default_type = IRQ_TYPE_NONE; - girq->handler = handle_bad_irq; - girq->parents[0] = irq; - - When this is done, we will delete the old APIs for instatiating - GPIOLIB_IRQCHIP and simplify the code. - - Look over and identify any remaining easily converted drivers and dry-code conversions to gpiolib irqchip for maintainers to test -- Drop gpiochip_set_chained_irqchip() when all the chained irqchips - have been converted to the above infrastructure. - -- Add more infrastructure to make it possible to also pass a threaded - irqchip in struct gpio_irq_chip. - -- Drop gpiochip_irqchip_add_nested() when all the chained irqchips - have been converted to the above infrastructure. - Increase integration with pin control diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 3b23a0ca77dd..8e29a60c3697 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -924,67 +924,6 @@ bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gc, } EXPORT_SYMBOL_GPL(gpiochip_irqchip_irq_valid); -/** - * gpiochip_set_cascaded_irqchip() - connects a cascaded irqchip to a gpiochip - * @gc: the gpiochip to set the irqchip chain to - * @parent_irq: the irq number corresponding to the parent IRQ for this - * cascaded irqchip - * @parent_handler: the parent interrupt handler for the accumulated IRQ - * coming out of the gpiochip. If the interrupt is nested rather than - * cascaded, pass NULL in this handler argument - */ -static void gpiochip_set_cascaded_irqchip(struct gpio_chip *gc, - unsigned int parent_irq, - irq_flow_handler_t parent_handler) -{ - struct gpio_irq_chip *girq = &gc->irq; - struct device *dev = &gc->gpiodev->dev; - - if (!girq->domain) { - chip_err(gc, "called %s before setting up irqchip\n", - __func__); - return; - } - - if (parent_handler) { - if (gc->can_sleep) { - chip_err(gc, - "you cannot have chained interrupts on a chip that may sleep\n"); - return; - } - girq->parents = devm_kcalloc(dev, 1, - sizeof(*girq->parents), - GFP_KERNEL); - if (!girq->parents) { - chip_err(gc, "out of memory allocating parent IRQ\n"); - return; - } - girq->parents[0] = parent_irq; - girq->num_parents = 1; - /* - * The parent irqchip is already using the chip_data for this - * irqchip, so our callbacks simply use the handler_data. 
- */ - irq_set_chained_handler_and_data(parent_irq, parent_handler, - gc); - } -} - -/** - * gpiochip_set_nested_irqchip() - connects a nested irqchip to a gpiochip - * @gc: the gpiochip to set the irqchip nested handler to - * @irqchip: the irqchip to nest to the gpiochip - * @parent_irq: the irq number corresponding to the parent IRQ for this - * nested irqchip - */ -void gpiochip_set_nested_irqchip(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int parent_irq) -{ - gpiochip_set_cascaded_irqchip(gc, parent_irq, NULL); -} -EXPORT_SYMBOL_GPL(gpiochip_set_nested_irqchip); - #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /** @@ -1635,98 +1574,6 @@ static void gpiochip_irqchip_remove(struct gpio_chip *gc) gpiochip_irqchip_free_valid_mask(gc); } -/** - * gpiochip_irqchip_add_key() - adds an irqchip to a gpiochip - * @gc: the gpiochip to add the irqchip to - * @irqchip: the irqchip to add to the gpiochip - * @first_irq: if not dynamically assigned, the base (first) IRQ to - * allocate gpiochip irqs from - * @handler: the irq handler to use (often a predefined irq core function) - * @type: the default type for IRQs on this irqchip, pass IRQ_TYPE_NONE - * to have the core avoid setting up any default type in the hardware. - * @threaded: whether this irqchip uses a nested thread handler - * @lock_key: lockdep class for IRQ lock - * @request_key: lockdep class for IRQ request - * - * This function closely associates a certain irqchip with a certain - * gpiochip, providing an irq domain to translate the local IRQs to - * global irqs in the gpiolib core, and making sure that the gpiochip - * is passed as chip data to all related functions. Driver callbacks - * need to use gpiochip_get_data() to get their local state containers back - * from the gpiochip passed as chip data. An irqdomain will be stored - * in the gpiochip that shall be used by the driver to handle IRQ number - * translation. The gpiochip will need to be initialized and registered - * before calling this function. - * - * This function will handle two cell:ed simple IRQs and assumes all - * the pins on the gpiochip can generate a unique IRQ. Everything else - * need to be open coded. - */ -int gpiochip_irqchip_add_key(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type, - bool threaded, - struct lock_class_key *lock_key, - struct lock_class_key *request_key) -{ - struct device_node *of_node; - - if (!gc || !irqchip) - return -EINVAL; - - if (!gc->parent) { - chip_err(gc, "missing gpiochip .dev parent pointer\n"); - return -EINVAL; - } - gc->irq.threaded = threaded; - of_node = gc->parent->of_node; -#ifdef CONFIG_OF_GPIO - /* - * If the gpiochip has an assigned OF node this takes precedence - * FIXME: get rid of this and use gc->parent->of_node - * everywhere - */ - if (gc->of_node) - of_node = gc->of_node; -#endif - /* - * Specifying a default trigger is a terrible idea if DT or ACPI is - * used to configure the interrupts, as you may end-up with - * conflicting triggers. Tell the user, and reset to NONE. 
- */ - if (WARN(of_node && type != IRQ_TYPE_NONE, - "%pOF: Ignoring %d default trigger\n", of_node, type)) - type = IRQ_TYPE_NONE; - if (has_acpi_companion(gc->parent) && type != IRQ_TYPE_NONE) { - acpi_handle_warn(ACPI_HANDLE(gc->parent), - "Ignoring %d default trigger\n", type); - type = IRQ_TYPE_NONE; - } - - gc->irq.chip = irqchip; - gc->irq.handler = handler; - gc->irq.default_type = type; - gc->to_irq = gpiochip_to_irq; - gc->irq.lock_key = lock_key; - gc->irq.request_key = request_key; - gc->irq.domain = irq_domain_add_simple(of_node, - gc->ngpio, first_irq, - &gpiochip_domain_ops, gc); - if (!gc->irq.domain) { - gc->irq.chip = NULL; - return -EINVAL; - } - - gpiochip_set_irq_hooks(gc); - - acpi_gpiochip_request_interrupts(gc); - - return 0; -} -EXPORT_SYMBOL_GPL(gpiochip_irqchip_add_key); - /** * gpiochip_irqchip_add_domain() - adds an irqdomain to a gpiochip * @gc: the gpiochip to add the irqchip to diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 4a7e295c3640..286de0520574 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -621,83 +621,12 @@ int gpiochip_irq_domain_activate(struct irq_domain *domain, void gpiochip_irq_domain_deactivate(struct irq_domain *domain, struct irq_data *data); -void gpiochip_set_nested_irqchip(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int parent_irq); - -int gpiochip_irqchip_add_key(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type, - bool threaded, - struct lock_class_key *lock_key, - struct lock_class_key *request_key); - bool gpiochip_irqchip_irq_valid(const struct gpio_chip *gc, unsigned int offset); int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain); -#ifdef CONFIG_LOCKDEP - -/* - * Lockdep requires that each irqchip instance be created with a - * unique key so as to avoid unnecessary warnings. This upfront - * boilerplate static inlines provides such a key for each - * unique instance. - */ -static inline int gpiochip_irqchip_add(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type) -{ - static struct lock_class_key lock_key; - static struct lock_class_key request_key; - - return gpiochip_irqchip_add_key(gc, irqchip, first_irq, - handler, type, false, - &lock_key, &request_key); -} - -static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type) -{ - - static struct lock_class_key lock_key; - static struct lock_class_key request_key; - - return gpiochip_irqchip_add_key(gc, irqchip, first_irq, - handler, type, true, - &lock_key, &request_key); -} -#else /* ! 
CONFIG_LOCKDEP */ -static inline int gpiochip_irqchip_add(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type) -{ - return gpiochip_irqchip_add_key(gc, irqchip, first_irq, - handler, type, false, NULL, NULL); -} - -static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gc, - struct irq_chip *irqchip, - unsigned int first_irq, - irq_flow_handler_t handler, - unsigned int type) -{ - return gpiochip_irqchip_add_key(gc, irqchip, first_irq, - handler, type, true, NULL, NULL); -} -#endif /* CONFIG_LOCKDEP */ - int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset); void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset); int gpiochip_generic_config(struct gpio_chip *gc, unsigned int offset, -- cgit v1.2.3 
From 3ebc0ef06e4a78522e9d1488dcf61b7d8fcfb792 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 11 Sep 2020 16:33:42 +0200 Subject: serial: s3c: Update path of Samsung S3C machine file Correct the path to the Samsung S3C24xx machine file mentioned in the documentation. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200911143343.498-2-krzk@kernel.org --- include/linux/serial_s3c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h index 463ed28d2b27..ca2c5393dc6b 100644 --- a/include/linux/serial_s3c.h +++ b/include/linux/serial_s3c.h @@ -254,7 +254,7 @@ * serial port * * the pointer is setup by the machine specific initialisation from the - * arch/arm/mach-s3c2410/ directory. + * arch/arm/mach-s3c/ directory. */ struct s3c2410_uartcfg { -- cgit v1.2.3 
From 1c552e08b29895d31bbf82bdb549811cfde31db4 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:02 -0700 Subject: firmware: ti_sci: rm: Add support for tx_tdtype parameter for tx channel The system controller's resource manager has support for configuring the TDTYPE of the TCHAN_CFG register on j721e. 
With this parameter the teardown completion can be controlled: TDTYPE == 0: Return without waiting for peer to complete the teardown TDTYPE == 1: Wait for peer to complete the teardown Signed-off-by: Peter Ujfalusi Reviewed-by: Tero Kristo Tested-by: Keerthy Reviewed-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 1 + drivers/firmware/ti_sci.h | 7 +++++++ include/linux/soc/ti/ti_sci_protocol.h | 2 ++ 3 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 896f53ec7857..65a8c2e82093 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2362,6 +2362,7 @@ static int ti_sci_cmd_rm_udmap_tx_ch_cfg(const struct ti_sci_handle *handle, req->fdepth = params->fdepth; req->tx_sched_priority = params->tx_sched_priority; req->tx_burst_size = params->tx_burst_size; + req->tx_tdtype = params->tx_tdtype; ret = ti_sci_do_xfer(info, xfer); if (ret) { diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 57cd04062994..dca19ca5fc49 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -910,6 +910,7 @@ struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg { * 12 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_credit_count * 13 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::fdepth * 14 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_burst_size + * 15 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_tdtype * * @nav_id: SoC device ID of Navigator Subsystem where tx channel is located * @@ -973,6 +974,11 @@ struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg { * * @tx_burst_size: UDMAP transmit channel burst size configuration to be * programmed into the tx_burst_size field of the TCHAN_TCFG register. + * + * @tx_tdtype: UDMAP transmit channel teardown type configuration to be + * programmed into the tdtype field of the TCHAN_TCFG register: + * 0 - Return immediately + * 1 - Wait for completion message from remote peer */ struct ti_sci_msg_rm_udmap_tx_ch_cfg_req { struct ti_sci_msg_hdr hdr; @@ -994,6 +1000,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg_req { u16 fdepth; u8 tx_sched_priority; u8 tx_burst_size; + u8 tx_tdtype; } __packed; /** diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index cf27b080e148..d254d99fd45b 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -345,6 +345,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg { #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_SUPR_TDPKT_VALID BIT(11) #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_CREDIT_COUNT_VALID BIT(12) #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FDEPTH_VALID BIT(13) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_TDTYPE_VALID BIT(15) u16 nav_id; u16 index; u8 tx_pause_on_err; @@ -362,6 +363,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg { u16 fdepth; u8 tx_sched_priority; u8 tx_burst_size; + u8 tx_tdtype; }; /** -- cgit v1.2.3 
From 967a020bd3deddf9a0af73aeb4d4b46d90030937 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:03 -0700 Subject: firmware: ti_sci: Use struct ti_sci_resource_desc in get_range ops Use the ti_sci_resource_desc directly and update its start and num members instead of requiring individual parameters for them. This will allow easy extension of the RM parameters without changing the API. 
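For illustration only, a minimal sketch of how a client might call the reworked op; the handle, dev, dev_id and subtype variables are placeholders, not anything defined by this patch:

	/* Hedged sketch: query a resource range via the reworked get_range op. */
	struct ti_sci_resource_desc desc;
	int ret;

	ret = handle->ops.rm_core_ops.get_range(handle, dev_id, subtype, &desc);
	if (!ret)
		dev_dbg(dev, "range: start %u num %u\n", desc.start, desc.num);
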
Signed-off-by: Peter Ujfalusi Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 32 +++++++++++++++----------------- include/linux/soc/ti/ti_sci_protocol.h | 32 ++++++++++++++++---------------- 2 files changed, 31 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 65a8c2e82093..7a777e91ce3e 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -1703,14 +1703,14 @@ fail: * @subtype: Resource assignment subtype that is being requested * from the given device. * @s_host: Host processor ID to which the resources are allocated - * @range_start: Start index of the resource range - * @range_num: Number of resources in the range + * @desc: Pointer to ti_sci_resource_desc to be updated with the + * resource range start index and number of resources * * Return: 0 if all went fine, else return appropriate error. */ static int ti_sci_get_resource_range(const struct ti_sci_handle *handle, u32 dev_id, u8 subtype, u8 s_host, - u16 *range_start, u16 *range_num) + struct ti_sci_resource_desc *desc) { struct ti_sci_msg_resp_get_resource_range *resp; struct ti_sci_msg_req_get_resource_range *req; @@ -1721,7 +1721,7 @@ static int ti_sci_get_resource_range(const struct ti_sci_handle *handle, if (IS_ERR(handle)) return PTR_ERR(handle); - if (!handle) + if (!handle || !desc) return -EINVAL; info = handle_to_ti_sci_info(handle); @@ -1754,8 +1754,8 @@ static int ti_sci_get_resource_range(const struct ti_sci_handle *handle, } else if (!resp->range_start && !resp->range_num) { ret = -ENODEV; } else { - *range_start = resp->range_start; - *range_num = resp->range_num; + desc->start = resp->range_start; + desc->num = resp->range_num; }; fail: @@ -1771,18 +1771,18 @@ fail: * @dev_id: TISCI device ID. * @subtype: Resource assignment subtype that is being requested * from the given device. - * @range_start: Start index of the resource range - * @range_num: Number of resources in the range + * @desc: Pointer to ti_sci_resource_desc to be updated with the + * resource range start index and number of resources * * Return: 0 if all went fine, else return appropriate error. */ static int ti_sci_cmd_get_resource_range(const struct ti_sci_handle *handle, u32 dev_id, u8 subtype, - u16 *range_start, u16 *range_num) + struct ti_sci_resource_desc *desc) { return ti_sci_get_resource_range(handle, dev_id, subtype, TI_SCI_IRQ_SECONDARY_HOST_INVALID, - range_start, range_num); + desc); } /** @@ -1793,18 +1793,17 @@ static int ti_sci_cmd_get_resource_range(const struct ti_sci_handle *handle, * @subtype: Resource assignment subtype that is being requested * from the given device. * @s_host: Host processor ID to which the resources are allocated - * @range_start: Start index of the resource range - * @range_num: Number of resources in the range + * @desc: Pointer to ti_sci_resource_desc to be updated with the + * resource range start index and number of resources * * Return: 0 if all went fine, else return appropriate error. 
*/ static int ti_sci_cmd_get_resource_range_from_shost(const struct ti_sci_handle *handle, u32 dev_id, u8 subtype, u8 s_host, - u16 *range_start, u16 *range_num) + struct ti_sci_resource_desc *desc) { - return ti_sci_get_resource_range(handle, dev_id, subtype, s_host, - range_start, range_num); + return ti_sci_get_resource_range(handle, dev_id, subtype, s_host, desc); } /** @@ -3243,8 +3242,7 @@ devm_ti_sci_get_resource_sets(const struct ti_sci_handle *handle, for (i = 0; i < res->sets; i++) { ret = handle->ops.rm_core_ops.get_range(handle, dev_id, sub_types[i], - &res->desc[i].start, - &res->desc[i].num); + &res->desc[i]); if (ret) { dev_dbg(dev, "dev = %d subtype %d not allocated for this host\n", dev_id, sub_types[i]); diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index d254d99fd45b..6cd537db4d33 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -195,6 +195,18 @@ struct ti_sci_clk_ops { u64 *current_freq); }; +/** + * struct ti_sci_resource_desc - Description of TI SCI resource instance range. + * @start: Start index of the resource. + * @num: Number of resources. + * @res_map: Bitmap to manage the allocation of these resources. + */ +struct ti_sci_resource_desc { + u16 start; + u16 num; + unsigned long *res_map; +}; + /** * struct ti_sci_rm_core_ops - Resource management core operations * @get_range: Get a range of resources belonging to ti sci host. @@ -209,15 +221,15 @@ struct ti_sci_clk_ops { * - dev_id: TISCI device ID. * - subtype: Resource assignment subtype that is being requested * from the given device. - * - range_start: Start index of the resource range - * - range_end: Number of resources in the range + * - desc: Pointer to ti_sci_resource_desc to be updated with the resource + * range start index and number of resources */ struct ti_sci_rm_core_ops { int (*get_range)(const struct ti_sci_handle *handle, u32 dev_id, - u8 subtype, u16 *range_start, u16 *range_num); + u8 subtype, struct ti_sci_resource_desc *desc); int (*get_range_from_shost)(const struct ti_sci_handle *handle, u32 dev_id, u8 subtype, u8 s_host, - u16 *range_start, u16 *range_num); + struct ti_sci_resource_desc *desc); }; #define TI_SCI_RESASG_SUBTYPE_IR_OUTPUT 0 @@ -522,18 +534,6 @@ struct ti_sci_handle { #define TI_SCI_RESOURCE_NULL 0xffff -/** - * struct ti_sci_resource_desc - Description of TI SCI resource instance range. - * @start: Start index of the resource. - * @num: Number of resources. - * @res_map: Bitmap to manage the allocation of these resources. - */ -struct ti_sci_resource_desc { - u16 start; - u16 num; - unsigned long *res_map; -}; - /** * struct ti_sci_resource - Structure representing a resource assigned * to a device. -- cgit v1.2.3 From 519c5c0c558b529f835c9bb30f9a1eb2034d585c Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:03 -0700 Subject: firmware: ti_sci: rm: Add support for second resource range Sysfw added support for a second range in the resource range API to be able to describe complex allocations mainly for DMA channels. Update the ti_sci part to consider the second range as well. 
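As a hedged sketch (not part of this patch), a consumer walking a descriptor that may now carry two ranges could look like this:

	/* Illustrative only: report both ranges of a ti_sci_resource_desc. */
	static void dump_resource_desc(struct device *dev,
				       const struct ti_sci_resource_desc *d)
	{
		if (d->num)
			dev_dbg(dev, "primary: %u..%u\n",
				d->start, d->start + d->num - 1);
		if (d->num_sec)
			dev_dbg(dev, "secondary: %u..%u\n",
				d->start_sec, d->start_sec + d->num_sec - 1);
	}
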
Signed-off-by: Peter Ujfalusi Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 48 ++++++++++++++++++++++------------ drivers/firmware/ti_sci.h | 8 ++++-- include/linux/soc/ti/ti_sci_protocol.h | 8 ++++-- 3 files changed, 43 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 7a777e91ce3e..2793bb923881 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -1751,11 +1751,14 @@ static int ti_sci_get_resource_range(const struct ti_sci_handle *handle, if (!ti_sci_is_response_ack(resp)) { ret = -ENODEV; - } else if (!resp->range_start && !resp->range_num) { + } else if (!resp->range_num && !resp->range_num_sec) { + /* Neither of the two resource range is valid */ ret = -ENODEV; } else { desc->start = resp->range_start; desc->num = resp->range_num; + desc->start_sec = resp->range_start_sec; + desc->num_sec = resp->range_num_sec; }; fail: @@ -3157,12 +3160,18 @@ u16 ti_sci_get_free_resource(struct ti_sci_resource *res) raw_spin_lock_irqsave(&res->lock, flags); for (set = 0; set < res->sets; set++) { - free_bit = find_first_zero_bit(res->desc[set].res_map, - res->desc[set].num); - if (free_bit != res->desc[set].num) { - set_bit(free_bit, res->desc[set].res_map); + struct ti_sci_resource_desc *desc = &res->desc[set]; + int res_count = desc->num + desc->num_sec; + + free_bit = find_first_zero_bit(desc->res_map, res_count); + if (free_bit != res_count) { + set_bit(free_bit, desc->res_map); raw_spin_unlock_irqrestore(&res->lock, flags); - return res->desc[set].start + free_bit; + + if (desc->num && free_bit < desc->num) + return desc->start + free_bit; + else + return desc->start_sec + free_bit; } } raw_spin_unlock_irqrestore(&res->lock, flags); @@ -3183,10 +3192,14 @@ void ti_sci_release_resource(struct ti_sci_resource *res, u16 id) raw_spin_lock_irqsave(&res->lock, flags); for (set = 0; set < res->sets; set++) { - if (res->desc[set].start <= id && - (res->desc[set].num + res->desc[set].start) > id) - clear_bit(id - res->desc[set].start, - res->desc[set].res_map); + struct ti_sci_resource_desc *desc = &res->desc[set]; + + if (desc->num && desc->start <= id && + (desc->start + desc->num) > id) + clear_bit(id - desc->start, desc->res_map); + else if (desc->num_sec && desc->start_sec <= id && + (desc->start_sec + desc->num_sec) > id) + clear_bit(id - desc->start_sec, desc->res_map); } raw_spin_unlock_irqrestore(&res->lock, flags); } @@ -3203,7 +3216,7 @@ u32 ti_sci_get_num_resources(struct ti_sci_resource *res) u32 set, count = 0; for (set = 0; set < res->sets; set++) - count += res->desc[set].num; + count += res->desc[set].num + res->desc[set].num_sec; return count; } @@ -3227,7 +3240,7 @@ devm_ti_sci_get_resource_sets(const struct ti_sci_handle *handle, { struct ti_sci_resource *res; bool valid_set = false; - int i, ret; + int i, ret, res_count; res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL); if (!res) @@ -3246,18 +3259,19 @@ devm_ti_sci_get_resource_sets(const struct ti_sci_handle *handle, if (ret) { dev_dbg(dev, "dev = %d subtype %d not allocated for this host\n", dev_id, sub_types[i]); - res->desc[i].start = 0; - res->desc[i].num = 0; + memset(&res->desc[i], 0, sizeof(res->desc[i])); continue; } - dev_dbg(dev, "dev = %d, subtype = %d, start = %d, num = %d\n", + dev_dbg(dev, "dev/sub_type: %d/%d, start/num: %d/%d | %d/%d\n", dev_id, sub_types[i], res->desc[i].start, - res->desc[i].num); + res->desc[i].num, res->desc[i].start_sec, + res->desc[i].num_sec); valid_set = true; + 
res_count = res->desc[i].num + res->desc[i].num_sec; res->desc[i].res_map = - devm_kzalloc(dev, BITS_TO_LONGS(res->desc[i].num) * + devm_kzalloc(dev, BITS_TO_LONGS(res_count) * sizeof(*res->desc[i].res_map), GFP_KERNEL); if (!res->desc[i].res_map) return ERR_PTR(-ENOMEM); diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index dca19ca5fc49..4d980eb592c4 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -574,8 +574,10 @@ struct ti_sci_msg_req_get_resource_range { /** * struct ti_sci_msg_resp_get_resource_range - Response to resource get range. * @hdr: Generic Header - * @range_start: Start index of the resource range. - * @range_num: Number of resources in the range. + * @range_start: Start index of the first resource range. + * @range_num: Number of resources in the first range. + * @range_start_sec: Start index of the second resource range. + * @range_num_sec: Number of resources in the second range. * * Response to request TI_SCI_MSG_GET_RESOURCE_RANGE. */ @@ -583,6 +585,8 @@ struct ti_sci_msg_resp_get_resource_range { struct ti_sci_msg_hdr hdr; u16 range_start; u16 range_num; + u16 range_start_sec; + u16 range_num_sec; } __packed; /** diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 6cd537db4d33..9699b260de59 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -197,13 +197,17 @@ struct ti_sci_clk_ops { /** * struct ti_sci_resource_desc - Description of TI SCI resource instance range. - * @start: Start index of the resource. - * @num: Number of resources. + * @start: Start index of the first resource range. + * @num: Number of resources in the first range. + * @start_sec: Start index of the second resource range. + * @num_sec: Number of resources in the second range. * @res_map: Bitmap to manage the allocation of these resources. 
*/ struct ti_sci_resource_desc { u16 start; u16 num; + u16 start_sec; + u16 num_sec; unsigned long *res_map; }; -- cgit v1.2.3 From ce1feed58534d8489afb4900bee75dff15d950e0 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:05 -0700 Subject: firmware: ti_sci: rm: Add support for extended_ch_type for tx channel Sysfw added 'extended_ch_type' to the tx_ch_cfg_req message which should be used when BCDMA block copy channels are configured: extended_ch_type = 0 : the channel is split tx channel (tchan) extended_ch_type = 1 : the channel is block copy channel (bchan) Signed-off-by: Peter Ujfalusi Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 1 + drivers/firmware/ti_sci.h | 6 ++++++ include/linux/soc/ti/ti_sci_protocol.h | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 2793bb923881..0dd3fbb4f964 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2365,6 +2365,7 @@ static int ti_sci_cmd_rm_udmap_tx_ch_cfg(const struct ti_sci_handle *handle, req->tx_sched_priority = params->tx_sched_priority; req->tx_burst_size = params->tx_burst_size; req->tx_tdtype = params->tx_tdtype; + req->extended_ch_type = params->extended_ch_type; ret = ti_sci_do_xfer(info, xfer); if (ret) { diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 4d980eb592c4..ca15d8f1f8de 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -915,6 +915,7 @@ struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg { * 13 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::fdepth * 14 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_burst_size * 15 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_tdtype + * 16 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::extended_ch_type * * @nav_id: SoC device ID of Navigator Subsystem where tx channel is located * @@ -983,6 +984,10 @@ struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg { * programmed into the tdtype field of the TCHAN_TCFG register: * 0 - Return immediately * 1 - Wait for completion message from remote peer + * + * @extended_ch_type: Valid for BCDMA. 
+ 0 - the channel is split tx channel (tchan) + 1 - the channel is block copy channel (bchan) */ struct ti_sci_msg_rm_udmap_tx_ch_cfg_req { struct ti_sci_msg_hdr hdr; @@ -1005,6 +1010,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg_req { u8 tx_sched_priority; u8 tx_burst_size; u8 tx_tdtype; + u8 extended_ch_type; } __packed; /** diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 9699b260de59..6978afc00823 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -336,6 +336,9 @@ struct ti_sci_rm_psil_ops { #define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES 2 #define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES 3 +#define TI_SCI_RM_BCDMA_EXTENDED_CH_TYPE_TCHAN 0 +#define TI_SCI_RM_BCDMA_EXTENDED_CH_TYPE_BCHAN 1 + /* UDMAP TX/RX channel valid_params common declarations */ #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID BIT(0) #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID BIT(1) @@ -362,6 +365,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg { #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_CREDIT_COUNT_VALID BIT(12) #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FDEPTH_VALID BIT(13) #define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_TDTYPE_VALID BIT(15) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_EXTENDED_CH_TYPE_VALID BIT(16) u16 nav_id; u16 index; u8 tx_pause_on_err; @@ -380,6 +384,7 @@ struct ti_sci_msg_rm_udmap_tx_ch_cfg { u8 tx_sched_priority; u8 tx_burst_size; u8 tx_tdtype; + u8 extended_ch_type; }; /** -- cgit v1.2.3 
From 4d8ddf673a420aa25668eceeb4fbf33e2521fdf2 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:05 -0700 Subject: firmware: ti_sci: rm: Remove ring_get_config support The ring_get_cfg (0x1111 message) is not used, and it has not been supported by sysfw for a long time. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 80 ---------------------------------- drivers/firmware/ti_sci.h | 44 ------------------- include/linux/soc/ti/ti_sci_protocol.h | 6 --- 3 files changed, 130 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 0dd3fbb4f964..0b801e67e672 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2119,85 +2119,6 @@ fail: return ret; } -/** - * ti_sci_cmd_ring_get_config() - get RA ring configuration - * @handle: Pointer to TI SCI handle. - * @nav_id: Device ID of Navigator Subsystem from which the ring is - * allocated - * @index: Ring index - * @addr_lo: Returns ring's base address lo 32 bits - * @addr_hi: Returns ring's base address hi 32 bits - * @count: Returns number of ring elements - * @mode: Returns mode of the ring - * @size: Returns ring element size - * @order_id: Returns ring's bus order ID - * - * Return: 0 if all went well, else returns appropriate error value. - * - * See @ti_sci_msg_rm_ring_get_cfg_req for more info. 
- */ -static int ti_sci_cmd_ring_get_config(const struct ti_sci_handle *handle, - u32 nav_id, u32 index, u8 *mode, - u32 *addr_lo, u32 *addr_hi, - u32 *count, u8 *size, u8 *order_id) -{ - struct ti_sci_msg_rm_ring_get_cfg_resp *resp; - struct ti_sci_msg_rm_ring_get_cfg_req *req; - struct ti_sci_xfer *xfer; - struct ti_sci_info *info; - struct device *dev; - int ret = 0; - - if (IS_ERR_OR_NULL(handle)) - return -EINVAL; - - info = handle_to_ti_sci_info(handle); - dev = info->dev; - - xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_GET_CFG, - TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, - sizeof(*req), sizeof(*resp)); - if (IS_ERR(xfer)) { - ret = PTR_ERR(xfer); - dev_err(dev, - "RM_RA:Message get config failed(%d)\n", ret); - return ret; - } - req = (struct ti_sci_msg_rm_ring_get_cfg_req *)xfer->xfer_buf; - req->nav_id = nav_id; - req->index = index; - - ret = ti_sci_do_xfer(info, xfer); - if (ret) { - dev_err(dev, "RM_RA:Mbox get config send fail %d\n", ret); - goto fail; - } - - resp = (struct ti_sci_msg_rm_ring_get_cfg_resp *)xfer->xfer_buf; - - if (!ti_sci_is_response_ack(resp)) { - ret = -ENODEV; - } else { - if (mode) - *mode = resp->mode; - if (addr_lo) - *addr_lo = resp->addr_lo; - if (addr_hi) - *addr_hi = resp->addr_hi; - if (count) - *count = resp->count; - if (size) - *size = resp->size; - if (order_id) - *order_id = resp->order_id; - }; - -fail: - ti_sci_put_one_xfer(&info->minfo, xfer); - dev_dbg(dev, "RM_RA:get config ring %u ret:%d\n", index, ret); - return ret; -} - /** * ti_sci_cmd_rm_psil_pair() - Pair PSI-L source to destination thread * @handle: Pointer to TI SCI handle. @@ -2926,7 +2847,6 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) iops->free_event_map = ti_sci_cmd_free_event_map; rops->config = ti_sci_cmd_ring_config; - rops->get_config = ti_sci_cmd_ring_get_config; psilops->pair = ti_sci_cmd_rm_psil_pair; psilops->unpair = ti_sci_cmd_rm_psil_unpair; diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index ca15d8f1f8de..1cdf918be861 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -49,7 +49,6 @@ #define TI_SCI_MSG_RM_RING_RECONFIG 0x1102 #define TI_SCI_MSG_RM_RING_RESET 0x1103 #define TI_SCI_MSG_RM_RING_CFG 0x1110 -#define TI_SCI_MSG_RM_RING_GET_CFG 0x1111 /* PSI-L requests */ #define TI_SCI_MSG_RM_PSIL_PAIR 0x1280 @@ -687,49 +686,6 @@ struct ti_sci_msg_rm_ring_cfg_req { u8 order_id; } __packed; -/** - * struct ti_sci_msg_rm_ring_get_cfg_req - Get RA ring's configuration - * - * Gets the configuration of the non-real-time register fields of a ring. The - * host, or a supervisor of the host, who owns the ring must be the requesting - * host. The values of the non-real-time registers are returned in - * @ti_sci_msg_rm_ring_get_cfg_resp. - * - * @hdr: Generic Header - * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated - * @index: ring index. - */ -struct ti_sci_msg_rm_ring_get_cfg_req { - struct ti_sci_msg_hdr hdr; - u16 nav_id; - u16 index; -} __packed; - -/** - * struct ti_sci_msg_rm_ring_get_cfg_resp - Ring get configuration response - * - * Response received by host processor after RM has handled - * @ti_sci_msg_rm_ring_get_cfg_req. The response contains the ring's - * non-real-time register values. - * - * @hdr: Generic Header - * @addr_lo: Ring 32 LSBs of base address - * @addr_hi: Ring 16 MSBs of base address. - * @count: Ring number of elements. - * @mode: Ring mode. - * @size: encoded Ring element size - * @order_id: ing order ID. 
- */ -struct ti_sci_msg_rm_ring_get_cfg_resp { - struct ti_sci_msg_hdr hdr; - u32 addr_lo; - u32 addr_hi; - u32 count; - u8 mode; - u8 size; - u8 order_id; -} __packed; - /** * struct ti_sci_msg_psil_pair - Pairs a PSI-L source thread to a destination * thread diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 6978afc00823..6710d7ac7a72 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -286,8 +286,6 @@ struct ti_sci_rm_irq_ops { /** * struct ti_sci_rm_ringacc_ops - Ring Accelerator Management operations * @config: configure the SoC Navigator Subsystem Ring Accelerator ring - * @get_config: get the SoC Navigator Subsystem Ring Accelerator ring - * configuration */ struct ti_sci_rm_ringacc_ops { int (*config)(const struct ti_sci_handle *handle, @@ -295,10 +293,6 @@ struct ti_sci_rm_ringacc_ops { u32 addr_lo, u32 addr_hi, u32 count, u8 mode, u8 size, u8 order_id ); - int (*get_config)(const struct ti_sci_handle *handle, - u32 nav_id, u32 index, u8 *mode, - u32 *addr_lo, u32 *addr_hi, u32 *count, - u8 *size, u8 *order_id); }; /** -- cgit v1.2.3 From 3c2017536f3a122bf246cc87f9327e9ec138db92 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:06 -0700 Subject: firmware: ti_sci: rm: Add new ops for ring configuration The sysfw ring configuration message has been extended to include virtid and asel value for the ring. Add the ASEL_VALID to TI_SCI_MSG_VALUE_RM_ALL_NO_ORDER as it is required for DMA rings. Instead of extending the current .config() ops - which would need same patch change in the ringacc driver - add ti_sci_msg_rm_ring_cfg struct and a new ops using it to configure the ring. This will allow easy update path in case new members are added for the ring configuration. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 63 ++++++++++++++++++++++++++++++++++ drivers/firmware/ti_sci.h | 7 ++++ include/linux/soc/ti/ti_sci_protocol.h | 31 ++++++++++++++++- 3 files changed, 100 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 0b801e67e672..a4d2b318795c 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2119,6 +2119,68 @@ fail: return ret; } +/** + * ti_sci_cmd_rm_ring_cfg() - Configure a NAVSS ring + * @handle: Pointer to TI SCI handle. + * @params: Pointer to ti_sci_msg_rm_ring_cfg ring config structure + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_ring_cfg and @ti_sci_msg_rm_ring_cfg_req for + * more info. 
+ */ +static int ti_sci_cmd_rm_ring_cfg(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_ring_cfg *params) +{ + struct ti_sci_msg_rm_ring_cfg_req *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "RM_RA:Message config failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_ring_cfg_req *)xfer->xfer_buf; + req->valid_params = params->valid_params; + req->nav_id = params->nav_id; + req->index = params->index; + req->addr_lo = params->addr_lo; + req->addr_hi = params->addr_hi; + req->count = params->count; + req->mode = params->mode; + req->size = params->size; + req->order_id = params->order_id; + req->virtid = params->virtid; + req->asel = params->asel; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "RM_RA:Mbox config send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(dev, "RM_RA:config ring %u ret:%d\n", params->index, ret); + return ret; +} + /** * ti_sci_cmd_rm_psil_pair() - Pair PSI-L source to destination thread * @handle: Pointer to TI SCI handle. @@ -2847,6 +2909,7 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) iops->free_event_map = ti_sci_cmd_free_event_map; rops->config = ti_sci_cmd_ring_config; + rops->set_cfg = ti_sci_cmd_rm_ring_cfg; psilops->pair = ti_sci_cmd_rm_psil_pair; psilops->unpair = ti_sci_cmd_rm_psil_unpair; diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 1cdf918be861..ef3a8214d002 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -659,6 +659,8 @@ struct ti_sci_msg_req_manage_irq { * 3 - Valid bit for @tisci_msg_rm_ring_cfg_req mode * 4 - Valid bit for @tisci_msg_rm_ring_cfg_req size * 5 - Valid bit for @tisci_msg_rm_ring_cfg_req order_id + * 6 - Valid bit for @tisci_msg_rm_ring_cfg_req virtid + * 7 - Valid bit for @tisci_msg_rm_ring_cfg_req ASEL * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated * @index: ring index to be configured. * @addr_lo: 32 LSBs of ring base address to be programmed into the ring's @@ -672,6 +674,9 @@ struct ti_sci_msg_req_manage_irq { * the formula (log2(size_bytes) - 2), where size_bytes cannot be * greater than 256. * @order_id: Specifies the ring's bus order ID. + * @virtid: Ring virt ID value + * @asel: Ring ASEL (address select) value to be set into the ASEL field of the + * ring's RING_BA_HI register. 
*/ struct ti_sci_msg_rm_ring_cfg_req { struct ti_sci_msg_hdr hdr; @@ -684,6 +689,8 @@ struct ti_sci_msg_rm_ring_cfg_req { u8 mode; u8 size; u8 order_id; + u16 virtid; + u8 asel; } __packed; /** diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 6710d7ac7a72..d1711050cd9d 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -275,17 +275,44 @@ struct ti_sci_rm_irq_ops { #define TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID BIT(4) /* RA config.order_id parameter is valid for RM ring configure TISCI message */ #define TI_SCI_MSG_VALUE_RM_RING_ORDER_ID_VALID BIT(5) +/* RA config.virtid parameter is valid for RM ring configure TISCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_VIRTID_VALID BIT(6) +/* RA config.asel parameter is valid for RM ring configure TISCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_ASEL_VALID BIT(7) #define TI_SCI_MSG_VALUE_RM_ALL_NO_ORDER \ (TI_SCI_MSG_VALUE_RM_RING_ADDR_LO_VALID | \ TI_SCI_MSG_VALUE_RM_RING_ADDR_HI_VALID | \ TI_SCI_MSG_VALUE_RM_RING_COUNT_VALID | \ TI_SCI_MSG_VALUE_RM_RING_MODE_VALID | \ - TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID) + TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID | \ + TI_SCI_MSG_VALUE_RM_RING_ASEL_VALID) + +/** + * struct ti_sci_msg_rm_ring_cfg - Ring configuration + * + * Parameters for Navigator Subsystem ring configuration + * See @ti_sci_msg_rm_ring_cfg_req + */ +struct ti_sci_msg_rm_ring_cfg { + u32 valid_params; + u16 nav_id; + u16 index; + u32 addr_lo; + u32 addr_hi; + u32 count; + u8 mode; + u8 size; + u8 order_id; + u16 virtid; + u8 asel; +}; /** * struct ti_sci_rm_ringacc_ops - Ring Accelerator Management operations * @config: configure the SoC Navigator Subsystem Ring Accelerator ring + * Deprecated + * @set_cfg: configure the SoC Navigator Subsystem Ring Accelerator ring */ struct ti_sci_rm_ringacc_ops { int (*config)(const struct ti_sci_handle *handle, @@ -293,6 +320,8 @@ struct ti_sci_rm_ringacc_ops { u32 addr_lo, u32 addr_hi, u32 count, u8 mode, u8 size, u8 order_id ); + int (*set_cfg)(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_ring_cfg *params); }; /** -- cgit v1.2.3 
From fed7552f1e69296461fca62ebaa0bb5a06fec0df Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:07 -0700 Subject: firmware: ti_sci: rm: Remove unused config() from ti_sci_rm_ringacc_ops The ringacc driver has been converted to use the new set_cfg function to configure the ring, so the old config ops can be removed. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 72 ---------------------------- include/linux/soc/ti/ti_sci_protocol.h | 7 ---- 2 files changed, 79 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index a4d2b318795c..235c7e7869aa 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2048,77 +2048,6 @@ static int ti_sci_cmd_free_event_map(const struct ti_sci_handle *handle, ia_id, vint, global_event, vint_status_bit, 0); } -/** - * ti_sci_cmd_ring_config() - configure RA ring - * @handle: Pointer to TI SCI handle. 
- * @valid_params: Bitfield defining validity of ring configuration - * parameters - * @nav_id: Device ID of Navigator Subsystem from which the ring is - * allocated - * @index: Ring index - * @addr_lo: The ring base address lo 32 bits - * @addr_hi: The ring base address hi 32 bits - * @count: Number of ring elements - * @mode: The mode of the ring - * @size: The ring element size. - * @order_id: Specifies the ring's bus order ID - * - * Return: 0 if all went well, else returns appropriate error value. - * - * See @ti_sci_msg_rm_ring_cfg_req for more info. - */ -static int ti_sci_cmd_ring_config(const struct ti_sci_handle *handle, - u32 valid_params, u16 nav_id, u16 index, - u32 addr_lo, u32 addr_hi, u32 count, - u8 mode, u8 size, u8 order_id) -{ - struct ti_sci_msg_rm_ring_cfg_req *req; - struct ti_sci_msg_hdr *resp; - struct ti_sci_xfer *xfer; - struct ti_sci_info *info; - struct device *dev; - int ret = 0; - - if (IS_ERR_OR_NULL(handle)) - return -EINVAL; - - info = handle_to_ti_sci_info(handle); - dev = info->dev; - - xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_CFG, - TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, - sizeof(*req), sizeof(*resp)); - if (IS_ERR(xfer)) { - ret = PTR_ERR(xfer); - dev_err(dev, "RM_RA:Message config failed(%d)\n", ret); - return ret; - } - req = (struct ti_sci_msg_rm_ring_cfg_req *)xfer->xfer_buf; - req->valid_params = valid_params; - req->nav_id = nav_id; - req->index = index; - req->addr_lo = addr_lo; - req->addr_hi = addr_hi; - req->count = count; - req->mode = mode; - req->size = size; - req->order_id = order_id; - - ret = ti_sci_do_xfer(info, xfer); - if (ret) { - dev_err(dev, "RM_RA:Mbox config send fail %d\n", ret); - goto fail; - } - - resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; - ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV; - -fail: - ti_sci_put_one_xfer(&info->minfo, xfer); - dev_dbg(dev, "RM_RA:config ring %u ret:%d\n", index, ret); - return ret; -} - /** * ti_sci_cmd_rm_ring_cfg() - Configure a NAVSS ring * @handle: Pointer to TI SCI handle. @@ -2908,7 +2837,6 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) iops->free_irq = ti_sci_cmd_free_irq; iops->free_event_map = ti_sci_cmd_free_event_map; - rops->config = ti_sci_cmd_ring_config; rops->set_cfg = ti_sci_cmd_rm_ring_cfg; psilops->pair = ti_sci_cmd_rm_psil_pair; diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index d1711050cd9d..0aad7009b50e 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -310,16 +310,9 @@ struct ti_sci_msg_rm_ring_cfg { /** * struct ti_sci_rm_ringacc_ops - Ring Accelerator Management operations - * @config: configure the SoC Navigator Subsystem Ring Accelerator ring - * Deprecated * @set_cfg: configure the SoC Navigator Subsystem Ring Accelerator ring */ struct ti_sci_rm_ringacc_ops { - int (*config)(const struct ti_sci_handle *handle, - u32 valid_params, u16 nav_id, u16 index, - u32 addr_lo, u32 addr_hi, u32 count, u8 mode, - u8 size, u8 order_id - ); int (*set_cfg)(const struct ti_sci_handle *handle, const struct ti_sci_msg_rm_ring_cfg *params); }; -- cgit v1.2.3 From 8c42379e40e2db4199ceeb6a6ef9fff73ff132cf Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Sun, 25 Oct 2020 12:10:22 -0700 Subject: soc: ti: k3-ringacc: Use correct device for allocation in RING mode In RING mode the ringacc does not access the ring memory. In this access mode the ringacc coherency does not have meaning. 
If the ring is configured in RING mode, then the ringacc itself will not access the ring memory. Only the requester (user) of the ring is going to read/write to the memory. Extend the ring configuration parameters with a device pointer to be used for the DMA API when the ring is configured in RING mode. Extending the ring configuration struct will allow per-ring selection of the device to be used for allocation, thus allowing per-ring coherency. To avoid regression, fall back to using the ringacc dev in case the dma_dev is not provided. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Signed-off-by: Santosh Shilimkar --- drivers/soc/ti/k3-ringacc.c | 18 +++++++++++++----- include/linux/soc/ti/k3-ringacc.h | 5 +++++ 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/ti/k3-ringacc.c b/drivers/soc/ti/k3-ringacc.c index 9ddd77113c5a..7fdb688452f7 100644 --- a/drivers/soc/ti/k3-ringacc.c +++ b/drivers/soc/ti/k3-ringacc.c @@ -141,6 +141,7 @@ struct k3_ring_state { * @parent: Pointer on struct @k3_ringacc * @use_count: Use count for shared rings * @proxy_id: RA Ring Proxy Id (only if @K3_RINGACC_RING_USE_PROXY) + * @dma_dev: device to be used for DMA API (allocation, mapping) */ struct k3_ring { struct k3_ring_rt_regs __iomem *rt; @@ -160,6 +161,7 @@ struct k3_ring { struct k3_ringacc *parent; u32 use_count; int proxy_id; + struct device *dma_dev; }; struct k3_ringacc_ops { @@ -508,11 +510,12 @@ int k3_ringacc_ring_free(struct k3_ring *ring) k3_ringacc_ring_free_sci(ring); - dma_free_coherent(ringacc->dev, + dma_free_coherent(ring->dma_dev, ring->size * (4 << ring->elm_size), ring->ring_mem_virt, ring->ring_mem_dma); ring->flags = 0; ring->ops = NULL; + ring->dma_dev = NULL; if (ring->proxy_id != K3_RINGACC_PROXY_NOT_USED) { clear_bit(ring->proxy_id, ringacc->proxy_inuse); ring->proxy = NULL; @@ -633,8 +636,12 @@ int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) switch (ring->mode) { case K3_RINGACC_RING_MODE_RING: ring->ops = &k3_ring_mode_ring_ops; + ring->dma_dev = cfg->dma_dev; + if (!ring->dma_dev) + ring->dma_dev = ringacc->dev; break; case K3_RINGACC_RING_MODE_MESSAGE: + ring->dma_dev = ringacc->dev; if (ring->proxy) ring->ops = &k3_ring_mode_proxy_ops; else @@ -646,9 +653,9 @@ int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) goto err_free_proxy; } - ring->ring_mem_virt = dma_alloc_coherent(ringacc->dev, - ring->size * (4 << ring->elm_size), - &ring->ring_mem_dma, GFP_KERNEL); + ring->ring_mem_virt = dma_alloc_coherent(ring->dma_dev, + ring->size * (4 << ring->elm_size), + &ring->ring_mem_dma, GFP_KERNEL); if (!ring->ring_mem_virt) { dev_err(ringacc->dev, "Failed to alloc ring mem\n"); ret = -ENOMEM; @@ -669,12 +676,13 @@ int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) return 0; err_free_mem: - dma_free_coherent(ringacc->dev, + dma_free_coherent(ring->dma_dev, ring->size * (4 << ring->elm_size), ring->ring_mem_virt, ring->ring_mem_dma); err_free_ops: ring->ops = NULL; + ring->dma_dev = NULL; err_free_proxy: ring->proxy = NULL; return ret; diff --git a/include/linux/soc/ti/k3-ringacc.h b/include/linux/soc/ti/k3-ringacc.h index 5a472eca5ee4..658dc71d2901 100644 --- a/include/linux/soc/ti/k3-ringacc.h +++ b/include/linux/soc/ti/k3-ringacc.h @@ -67,6 +67,9 @@ struct k3_ring; * few times. It's usable when the same ring is used as Free Host PD ring * for different flows, for example. 
* Note: Locking should be done by consumer if required + * @dma_dev: Master device which is using and accessing to the ring + * memory when the mode is K3_RINGACC_RING_MODE_RING. Memory allocations + * should be done using this device. */ struct k3_ring_cfg { u32 size; @@ -74,6 +77,8 @@ struct k3_ring_cfg { enum k3_ring_mode mode; #define K3_RINGACC_RING_SHARED BIT(1) u32 flags; + + struct device *dma_dev; }; #define K3_RINGACC_RING_ID_ANY (-1) -- cgit v1.2.3 From 5190db9fdd20fa5ba6084c98a3bc71c2fdf6a871 Mon Sep 17 00:00:00 2001 From: Roman Anufriev Date: Sun, 18 Oct 2020 05:56:54 +0300 Subject: fs/quota: update quota state flags scheme with project quota flags Current quota state flags scheme doesn't include project quota and thus shows all flags after DQUOT_USAGE_ENABLED wrong. Fix this and also add DQUOT_NOLIST_DIRTY to the scheme. Link: https://lore.kernel.org/r/1602989814-28922-1-git-send-email-dotdot@yandex-team.ru Signed-off-by: Roman Anufriev Signed-off-by: Jan Kara --- include/linux/quota.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/quota.h b/include/linux/quota.h index 27aab84fcbaa..18ebd39c9487 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -448,17 +448,18 @@ struct quota_format_type { }; /** - * Quota state flags - they actually come in two flavors - for users and groups. + * Quota state flags - they come in three flavors - for users, groups and projects. * * Actual typed flags layout: - * USRQUOTA GRPQUOTA - * DQUOT_USAGE_ENABLED 0x0001 0x0002 - * DQUOT_LIMITS_ENABLED 0x0004 0x0008 - * DQUOT_SUSPENDED 0x0010 0x0020 + * USRQUOTA GRPQUOTA PRJQUOTA + * DQUOT_USAGE_ENABLED 0x0001 0x0002 0x0004 + * DQUOT_LIMITS_ENABLED 0x0008 0x0010 0x0020 + * DQUOT_SUSPENDED 0x0040 0x0080 0x0100 * * Following bits are used for non-typed flags: - * DQUOT_QUOTA_SYS_FILE 0x0040 - * DQUOT_NEGATIVE_USAGE 0x0080 + * DQUOT_QUOTA_SYS_FILE 0x0200 + * DQUOT_NEGATIVE_USAGE 0x0400 + * DQUOT_NOLIST_DIRTY 0x0800 */ enum { _DQUOT_USAGE_ENABLED = 0, /* Track disk usage for users */ -- cgit v1.2.3 From e1ac4b2406d94eddce8ac2c5ab4235f6075a9602 Mon Sep 17 00:00:00 2001 From: Chester Lin Date: Fri, 30 Oct 2020 14:08:38 +0800 Subject: efi: generalize efi_get_secureboot Generalize the efi_get_secureboot() function so not only efistub but also other subsystems can use it. Note that the MokSbState handling is not factored out: the variable is boot time only, and so it cannot be parameterized as easily. Also, the IMA code will switch to this version in a future patch, and it does not incorporate the MokSbState exception in the first place. Note that the new efi_get_secureboot_mode() helper treats any failures to read SetupMode as setup mode being disabled. 
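For illustration, a minimal sketch of how a kernel-proper consumer could call the generalized helper, assuming the global efi.get_variable runtime hook and efi_enabled() as in mainline; example_secure_boot_enabled() is hypothetical and not part of this patch:

#include <linux/efi.h>

/* Hypothetical consumer: ask the firmware whether Secure Boot is on. */
static bool example_secure_boot_enabled(void)
{
	if (!efi_enabled(EFI_RUNTIME_SERVICES))
		return false;

	/* Pass the runtime GetVariable service to the new helper */
	return efi_get_secureboot_mode(efi.get_variable) ==
	       efi_secureboot_mode_enabled;
}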
Co-developed-by: Chester Lin Signed-off-by: Chester Lin Acked-by: Mimi Zohar Signed-off-by: Ard Biesheuvel --- arch/x86/boot/compressed/Makefile | 2 +- drivers/firmware/efi/libstub/efistub.h | 2 ++ drivers/firmware/efi/libstub/secureboot.c | 41 +++++++++++-------------------- include/linux/efi.h | 23 ++++++++++++++++- 4 files changed, 40 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index ee249088cbfe..8d358a6fe6ec 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -35,7 +35,7 @@ cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-mmx -mno-sse -KBUILD_CFLAGS += -ffreestanding +KBUILD_CFLAGS += -ffreestanding -fshort-wchar KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 2d7abcd99de9..b8ec29d6a74a 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -848,4 +848,6 @@ asmlinkage void __noreturn efi_enter_kernel(unsigned long entrypoint, void efi_handle_post_ebs_state(void); +enum efi_secureboot_mode efi_get_secureboot(void); + #endif diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c index 5efc524b14be..af18d86c1604 100644 --- a/drivers/firmware/efi/libstub/secureboot.c +++ b/drivers/firmware/efi/libstub/secureboot.c @@ -12,15 +12,16 @@ #include "efistub.h" -/* BIOS variables */ -static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; -static const efi_char16_t efi_SecureBoot_name[] = L"SecureBoot"; -static const efi_char16_t efi_SetupMode_name[] = L"SetupMode"; - /* SHIM variables */ static const efi_guid_t shim_guid = EFI_SHIM_LOCK_GUID; static const efi_char16_t shim_MokSBState_name[] = L"MokSBState"; +static efi_status_t get_var(efi_char16_t *name, efi_guid_t *vendor, u32 *attr, + unsigned long *data_size, void *data) +{ + return get_efi_var(name, vendor, attr, data_size, data); +} + /* * Determine whether we're in secure boot mode. * @@ -30,26 +31,18 @@ static const efi_char16_t shim_MokSBState_name[] = L"MokSBState"; enum efi_secureboot_mode efi_get_secureboot(void) { u32 attr; - u8 secboot, setupmode, moksbstate; unsigned long size; + enum efi_secureboot_mode mode; efi_status_t status; + u8 moksbstate; - size = sizeof(secboot); - status = get_efi_var(efi_SecureBoot_name, &efi_variable_guid, - NULL, &size, &secboot); - if (status == EFI_NOT_FOUND) - return efi_secureboot_mode_disabled; - if (status != EFI_SUCCESS) - goto out_efi_err; - - size = sizeof(setupmode); - status = get_efi_var(efi_SetupMode_name, &efi_variable_guid, - NULL, &size, &setupmode); - if (status != EFI_SUCCESS) - goto out_efi_err; - - if (secboot == 0 || setupmode == 1) - return efi_secureboot_mode_disabled; + mode = efi_get_secureboot_mode(get_var); + if (mode == efi_secureboot_mode_unknown) { + efi_err("Could not determine UEFI Secure Boot status.\n"); + return efi_secureboot_mode_unknown; + } + if (mode != efi_secureboot_mode_enabled) + return mode; /* * See if a user has put the shim into insecure mode. 
If so, and if the @@ -69,8 +62,4 @@ enum efi_secureboot_mode efi_get_secureboot(void) secure_boot_enabled: efi_info("UEFI Secure Boot is enabled.\n"); return efi_secureboot_mode_enabled; - -out_efi_err: - efi_err("Could not determine UEFI Secure Boot status.\n"); - return efi_secureboot_mode_unknown; } diff --git a/include/linux/efi.h b/include/linux/efi.h index d7c0e73af2b9..1cd5d91d8ca1 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1089,7 +1089,28 @@ enum efi_secureboot_mode { efi_secureboot_mode_disabled, efi_secureboot_mode_enabled, }; -enum efi_secureboot_mode efi_get_secureboot(void); + +static inline +enum efi_secureboot_mode efi_get_secureboot_mode(efi_get_variable_t *get_var) +{ + u8 secboot, setupmode = 0; + efi_status_t status; + unsigned long size; + + size = sizeof(secboot); + status = get_var(L"SecureBoot", &EFI_GLOBAL_VARIABLE_GUID, NULL, &size, + &secboot); + if (status == EFI_NOT_FOUND) + return efi_secureboot_mode_disabled; + if (status != EFI_SUCCESS) + return efi_secureboot_mode_unknown; + + size = sizeof(setupmode); + get_var(L"SetupMode", &EFI_GLOBAL_VARIABLE_GUID, NULL, &size, &setupmode); + if (secboot == 0 || setupmode == 1) + return efi_secureboot_mode_disabled; + return efi_secureboot_mode_enabled; +} #ifdef CONFIG_RESET_ATTACK_MITIGATION void efi_enable_reset_attack_mitigation(void); -- cgit v1.2.3 From 9d1c94a69d70f1b02bdf06b231cd16ad47ef06cd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:33:25 +0200 Subject: clk: fix a kernel-doc markup clk_get_duty_cycle -> clk_get_scaled_duty_cycle Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/b2336f3f3cdfe6e1a2d3a7a056ab7ccc7a81b945.1603469755.git.mchehab+huawei@kernel.org Signed-off-by: Stephen Boyd --- include/linux/clk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index 7fd6a1febcf4..5f8d5f4931c0 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -150,7 +150,7 @@ int clk_get_phase(struct clk *clk); int clk_set_duty_cycle(struct clk *clk, unsigned int num, unsigned int den); /** - * clk_get_duty_cycle - return the duty cycle ratio of a clock signal + * clk_get_scaled_duty_cycle - return the duty cycle ratio of a clock signal * @clk: clock signal source * @scale: scaling factor to be applied to represent the ratio as an integer * -- cgit v1.2.3 From 0264c8c9e1b53e9dbb41fae5e54756e84644bc60 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:36 -0500 Subject: ftrace: Move the recursion testing into global headers Currently, if a callback is registered to a ftrace function and its ftrace_ops does not have the RECURSION flag set, it is encapsulated in a helper function that does the recursion for it. Really, all the callbacks should have their own recursion protection for performance reasons. But they should not all implement their own. Move the recursion helpers to global headers, so that all callbacks can use them. 
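A minimal sketch of a callback guarding itself with the relocated helpers; my_callback and its tracing work are hypothetical, not taken from this patch:

#include <linux/ftrace.h>
#include <linux/trace_recursion.h>

static void my_callback(unsigned long ip, unsigned long parent_ip,
			struct ftrace_ops *op, struct pt_regs *regs)
{
	int bit;

	/* Bail out if this context already holds a recursion bit */
	bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
	if (bit < 0)
		return;

	/* ... the actual tracing work goes here ... */

	trace_clear_recursion(bit);
}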
Link: https://lkml.kernel.org/r/20201028115612.460535535@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.166456258@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 1 + include/linux/trace_recursion.h | 187 ++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 177 ------------------------------------- 3 files changed, 188 insertions(+), 177 deletions(-) create mode 100644 include/linux/trace_recursion.h (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1bd3a0356ae4..0e4164a7f56d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -7,6 +7,7 @@ #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H +#include #include #include #include diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h new file mode 100644 index 000000000000..dbb7b6d4c94c --- /dev/null +++ b/include/linux/trace_recursion.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TRACE_RECURSION_H +#define _LINUX_TRACE_RECURSION_H + +#include +#include + +#ifdef CONFIG_TRACING + +/* Only current can touch trace_recursion */ + +/* + * For function tracing recursion: + * The order of these bits are important. + * + * When function tracing occurs, the following steps are made: + * If arch does not support a ftrace feature: + * call internal function (uses INTERNAL bits) which calls... + * If callback is registered to the "global" list, the list + * function is called and recursion checks the GLOBAL bits. + * then this function calls... + * The function callback, which can use the FTRACE bits to + * check for recursion. + * + * Now if the arch does not support a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. + */ +enum { + /* Function recursion bits */ + TRACE_FTRACE_BIT, + TRACE_FTRACE_NMI_BIT, + TRACE_FTRACE_IRQ_BIT, + TRACE_FTRACE_SIRQ_BIT, + + /* INTERNAL_BITs must be greater than FTRACE_BITs */ + TRACE_INTERNAL_BIT, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + + TRACE_BRANCH_BIT, +/* + * Abuse of the trace_recursion. + * As we need a way to maintain state if we are tracing the function + * graph in irq because we want to trace a particular function that + * was called in irq context but we have irq tracing off. Since this + * can only be modified by current, we can reuse trace_recursion. + */ + TRACE_IRQ_BIT, + + /* Set if the function is in the set_graph_function file */ + TRACE_GRAPH_BIT, + + /* + * In the very unlikely case that an interrupt came in + * at a start of graph tracing, and we want to trace + * the function in that interrupt, the depth can be greater + * than zero, because of the preempted start of a previous + * trace. 
In an even more unlikely case, depth could be 2 + * if a softirq interrupted the start of graph tracing, + * followed by an interrupt preempting a start of graph + * tracing in the softirq, and depth can even be 3 + * if an NMI came in at the start of an interrupt function + * that preempted a softirq start of a function that + * preempted normal context!!!! Luckily, it can't be + * greater than 3, so the next two bits are a mask + * of what the depth is when we set TRACE_GRAPH_BIT + */ + + TRACE_GRAPH_DEPTH_START_BIT, + TRACE_GRAPH_DEPTH_END_BIT, + + /* + * To implement set_graph_notrace, if this bit is set, we ignore + * function graph tracing of called functions, until the return + * function is called to clear it. + */ + TRACE_GRAPH_NOTRACE_BIT, + + /* + * When transitioning between context, the preempt_count() may + * not be correct. Allow for a single recursion to cover this case. + */ + TRACE_TRANSITION_BIT, +}; + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) + +#define trace_recursion_depth() \ + (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) +#define trace_recursion_set_depth(depth) \ + do { \ + current->trace_recursion &= \ + ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ + current->trace_recursion |= \ + ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ + } while (0) + +#define TRACE_CONTEXT_BITS 4 + +#define TRACE_FTRACE_START TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ + unsigned int val = current->trace_recursion; + int bit; + + /* A previous recursion check was made */ + if ((val & TRACE_CONTEXT_MASK) > max) + return 0; + + bit = trace_get_context_bit() + start; + if (unlikely(val & (1 << bit))) { + /* + * It could be that preempt_count has not been updated during + * a switch between contexts. Allow for a single recursion. + */ + bit = TRACE_TRANSITION_BIT; + if (trace_recursion_test(bit)) + return -1; + trace_recursion_set(bit); + barrier(); + return bit + 1; + } + + /* Normal check passed, clear the transition to allow it again */ + trace_recursion_clear(TRACE_TRANSITION_BIT); + + val |= 1 << bit; + current->trace_recursion = val; + barrier(); + + return bit + 1; +} + +static __always_inline void trace_clear_recursion(int bit) +{ + unsigned int val = current->trace_recursion; + + if (!bit) + return; + + bit--; + bit = 1 << bit; + val &= ~bit; + + barrier(); + current->trace_recursion = val; +} + +#endif /* CONFIG_TRACING */ +#endif /* _LINUX_TRACE_RECURSION_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1dadef445cd1..9462251cab92 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -558,183 +558,6 @@ struct tracer { bool noboot; }; - -/* Only current can touch trace_recursion */ - -/* - * For function tracing recursion: - * The order of these bits are important. 
- * - * When function tracing occurs, the following steps are made: - * If arch does not support a ftrace feature: - * call internal function (uses INTERNAL bits) which calls... - * If callback is registered to the "global" list, the list - * function is called and recursion checks the GLOBAL bits. - * then this function calls... - * The function callback, which can use the FTRACE bits to - * check for recursion. - * - * Now if the arch does not support a feature, and it calls - * the global list function which calls the ftrace callback - * all three of these steps will do a recursion protection. - * There's no reason to do one if the previous caller already - * did. The recursion that we are protecting against will - * go through the same steps again. - * - * To prevent the multiple recursion checks, if a recursion - * bit is set that is higher than the MAX bit of the current - * check, then we know that the check was made by the previous - * caller, and we can skip the current check. - */ -enum { - /* Function recursion bits */ - TRACE_FTRACE_BIT, - TRACE_FTRACE_NMI_BIT, - TRACE_FTRACE_IRQ_BIT, - TRACE_FTRACE_SIRQ_BIT, - - /* INTERNAL_BITs must be greater than FTRACE_BITs */ - TRACE_INTERNAL_BIT, - TRACE_INTERNAL_NMI_BIT, - TRACE_INTERNAL_IRQ_BIT, - TRACE_INTERNAL_SIRQ_BIT, - - TRACE_BRANCH_BIT, -/* - * Abuse of the trace_recursion. - * As we need a way to maintain state if we are tracing the function - * graph in irq because we want to trace a particular function that - * was called in irq context but we have irq tracing off. Since this - * can only be modified by current, we can reuse trace_recursion. - */ - TRACE_IRQ_BIT, - - /* Set if the function is in the set_graph_function file */ - TRACE_GRAPH_BIT, - - /* - * In the very unlikely case that an interrupt came in - * at a start of graph tracing, and we want to trace - * the function in that interrupt, the depth can be greater - * than zero, because of the preempted start of a previous - * trace. In an even more unlikely case, depth could be 2 - * if a softirq interrupted the start of graph tracing, - * followed by an interrupt preempting a start of graph - * tracing in the softirq, and depth can even be 3 - * if an NMI came in at the start of an interrupt function - * that preempted a softirq start of a function that - * preempted normal context!!!! Luckily, it can't be - * greater than 3, so the next two bits are a mask - * of what the depth is when we set TRACE_GRAPH_BIT - */ - - TRACE_GRAPH_DEPTH_START_BIT, - TRACE_GRAPH_DEPTH_END_BIT, - - /* - * To implement set_graph_notrace, if this bit is set, we ignore - * function graph tracing of called functions, until the return - * function is called to clear it. - */ - TRACE_GRAPH_NOTRACE_BIT, - - /* - * When transitioning between context, the preempt_count() may - * not be correct. Allow for a single recursion to cover this case. 
- */ - TRACE_TRANSITION_BIT, -}; - -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) - -#define trace_recursion_depth() \ - (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) -#define trace_recursion_set_depth(depth) \ - do { \ - current->trace_recursion &= \ - ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ - current->trace_recursion |= \ - ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ - } while (0) - -#define TRACE_CONTEXT_BITS 4 - -#define TRACE_FTRACE_START TRACE_FTRACE_BIT -#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) - -#define TRACE_LIST_START TRACE_INTERNAL_BIT -#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) - -#define TRACE_CONTEXT_MASK TRACE_LIST_MAX - -static __always_inline int trace_get_context_bit(void) -{ - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = 0; - - else if (in_irq()) - bit = 1; - else - bit = 2; - } else - bit = 3; - - return bit; -} - -static __always_inline int trace_test_and_set_recursion(int start, int max) -{ - unsigned int val = current->trace_recursion; - int bit; - - /* A previous recursion check was made */ - if ((val & TRACE_CONTEXT_MASK) > max) - return 0; - - bit = trace_get_context_bit() + start; - if (unlikely(val & (1 << bit))) { - /* - * It could be that preempt_count has not been updated during - * a switch between contexts. Allow for a single recursion. - */ - bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) - return -1; - trace_recursion_set(bit); - barrier(); - return bit + 1; - } - - /* Normal check passed, clear the transition to allow it again */ - trace_recursion_clear(TRACE_TRANSITION_BIT); - - val |= 1 << bit; - current->trace_recursion = val; - barrier(); - - return bit + 1; -} - -static __always_inline void trace_clear_recursion(int bit) -{ - unsigned int val = current->trace_recursion; - - if (!bit) - return; - - bit--; - bit = 1 << bit; - val &= ~bit; - - barrier(); - current->trace_recursion = val; -} - static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { -- cgit v1.2.3 From 6e4eb9cb22fc8a893cb708ed42644de5ee7c3827 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:37 -0500 Subject: ftrace: Add ftrace_test_recursion_trylock() helper function To make it easier for ftrace callbacks to have recursion protection, provide a ftrace_test_recursion_trylock() and ftrace_test_recursion_unlock() helper that tests for recursion. Link: https://lkml.kernel.org/r/20201028115612.634927593@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.378584067@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 25 +++++++++++++++++++++++++ kernel/trace/trace_functions.c | 12 +++++------- 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index dbb7b6d4c94c..f2a949dbfec7 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -183,5 +183,30 @@ static __always_inline void trace_clear_recursion(int bit) current->trace_recursion = val; } +/** + * ftrace_test_recursion_trylock - tests for recursion in same context + * + * Use this for ftrace callbacks. 
This will detect if the function + * tracing recursed in the same context (normal vs interrupt), + * + * Returns: -1 if a recursion happened. + * >= 0 if no recursion + */ +static __always_inline int ftrace_test_recursion_trylock(void) +{ + return trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); +} + +/** + * ftrace_test_recursion_unlock - called when function callback is complete + * @bit: The return of a successful ftrace_test_recursion_trylock() + * + * This is used at the end of a ftrace callback. + */ +static __always_inline void ftrace_test_recursion_unlock(int bit) +{ + trace_clear_recursion(bit); +} + #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_RECURSION_H */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 2c2126e1871d..943756c01190 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -141,22 +141,20 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (unlikely(!tr->function_enabled)) return; + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + pc = preempt_count(); preempt_disable_notrace(); - bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); - if (bit < 0) - goto out; - cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); if (!atomic_read(&data->disabled)) { local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); } - trace_clear_recursion(bit); - - out: + ftrace_test_recursion_unlock(bit); preempt_enable_notrace(); } -- cgit v1.2.3 From da5afbeb1724609996ca7bb4fbce2cd104c95914 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:38 -0500 Subject: ftrace: Optimize testing what context current is in The preempt_count() is not a simple location in memory, it could be part of per_cpu code or more. Each access to preempt_count(), or one of its accessor functions (like in_interrupt()) takes several cycles. By reading preempt_count() once, and then doing tests to find the context against the value return is slightly faster than using in_nmi() and in_interrupt(). Link: https://lkml.kernel.org/r/20201028115612.780796355@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.558881845@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index f2a949dbfec7..ac3d73484cb2 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -117,22 +117,29 @@ enum { #define TRACE_CONTEXT_MASK TRACE_LIST_MAX +/* + * Used for setting context + * NMI = 0 + * IRQ = 1 + * SOFTIRQ = 2 + * NORMAL = 3 + */ +enum { + TRACE_CTX_NMI, + TRACE_CTX_IRQ, + TRACE_CTX_SOFTIRQ, + TRACE_CTX_NORMAL, +}; + static __always_inline int trace_get_context_bit(void) { - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = 0; - - else if (in_irq()) - bit = 1; - else - bit = 2; - } else - bit = 3; + unsigned long pc = preempt_count(); - return bit; + if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) + return TRACE_CTX_NORMAL; + else + return pc & NMI_MASK ? TRACE_CTX_NMI : + pc & HARDIRQ_MASK ? 
TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ; } static __always_inline int trace_test_and_set_recursion(int start, int max) -- cgit v1.2.3 From a25d036d939a30623ff73ecad9c8b9116b02e823 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:45 -0500 Subject: ftrace: Reverse what the RECURSION flag means in the ftrace_ops Now that all callbacks are recursion safe, reverse the meaning of the RECURSION flag and rename it from RECURSION_SAFE to simply RECURSION. Now only callbacks that request to have recursion protecting it will have the added trampoline to do so. Also remove the outdated comment about "PER_CPU" when determining to use the ftrace_ops_assist_func. Link: https://lkml.kernel.org/r/20201028115613.742454631@goodmis.org Link: https://lkml.kernel.org/r/20201106023547.904270143@goodmis.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Jonathan Corbet Cc: Sebastian Andrzej Siewior Cc: Miroslav Benes Cc: Kamalesh Babulal Cc: Petr Mladek Cc: linux-doc@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 82 +++++++++++++++++++++++++++---------- include/linux/ftrace.h | 12 +++--- kernel/trace/fgraph.c | 3 +- kernel/trace/ftrace.c | 20 ++++----- kernel/trace/trace_events.c | 1 - kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_selftest.c | 7 +--- kernel/trace/trace_stack.c | 1 - 8 files changed, 79 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index a4955f7e3d19..86cd14b8e126 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -30,8 +30,8 @@ The ftrace context This requires extra care to what can be done inside a callback. A callback can be called outside the protective scope of RCU. -The ftrace infrastructure has some protections against recursions and RCU -but one must still be very careful how they use the callbacks. +There are helper functions to help against recursion, and making sure +RCU is watching. These are explained below. The ftrace_ops structure @@ -108,6 +108,50 @@ The prototype of the callback function is as follows (as of v4.14): at the start of the function where ftrace was tracing. Otherwise it either contains garbage, or NULL. +Protect your callback +===================== + +As functions can be called from anywhere, and it is possible that a function +called by a callback may also be traced, and call that same callback, +recursion protection must be used. There are two helper functions that +can help in this regard. If you start your code with: + + int bit; + + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + +and end it with: + + ftrace_test_recursion_unlock(bit); + +The code in between will be safe to use, even if it ends up calling a +function that the callback is tracing. Note, on success, +ftrace_test_recursion_trylock() will disable preemption, and the +ftrace_test_recursion_unlock() will enable it again (if it was previously +enabled). + +Alternatively, if the FTRACE_OPS_FL_RECURSION flag is set on the ftrace_ops +(as explained below), then a helper trampoline will be used to test +for recursion for the callback and no recursion test needs to be done. +But this is at the expense of a slightly more overhead from an extra +function call. 
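A minimal sketch that combines the trylock protection above with the RCU check covered in the next paragraph; my_ops_func is hypothetical:

static void my_ops_func(unsigned long ip, unsigned long parent_ip,
			struct ftrace_ops *op, struct pt_regs *regs)
{
	int bit;

	bit = ftrace_test_recursion_trylock();
	if (bit < 0)
		return;

	if (!rcu_is_watching())
		goto out;

	/* ... work that may touch RCU-protected data ... */
out:
	ftrace_test_recursion_unlock(bit);
}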
+ +If your callback accesses any data or critical section that requires RCU +protection, it is best to make sure that RCU is "watching", otherwise +that data or critical section will not be protected as expected. In this +case add: + + if (!rcu_is_watching()) + return; + +Alternatively, if the FTRACE_OPS_FL_RCU flag is set on the ftrace_ops +(as explained below), then a helper trampoline will be used to test +for rcu_is_watching for the callback and no other test needs to be done. +But this is at the expense of slightly more overhead from an extra +function call. + The ftrace FLAGS ================ @@ -128,26 +172,20 @@ FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED will not fail with this flag set. But the callback must check if regs is NULL or not to determine if the architecture supports it. -FTRACE_OPS_FL_RECURSION_SAFE - By default, a wrapper is added around the callback to - make sure that recursion of the function does not occur. That is, - if a function that is called as a result of the callback's execution - is also traced, ftrace will prevent the callback from being called - again. But this wrapper adds some overhead, and if the callback is - safe from recursion, it can set this flag to disable the ftrace - protection. - - Note, if this flag is set, and recursion does occur, it could cause - the system to crash, and possibly reboot via a triple fault. - - It is OK if another callback traces a function that is called by a - callback that is marked recursion safe. Recursion safe callbacks - must never trace any function that are called by the callback - itself or any nested functions that those functions call. - - If this flag is set, it is possible that the callback will also - be called with preemption enabled (when CONFIG_PREEMPTION is set), - but this is not guaranteed. +FTRACE_OPS_FL_RECURSION + By default, it is expected that the callback can handle recursion. + But if the callback is not that worried about overhead, then + setting this bit will add the recursion protection around the + callback by calling a helper function that will do the recursion + protection and only call the callback if it did not recurse. + + Note, if this flag is not set, and recursion does occur, it could + cause the system to crash, and possibly reboot via a triple fault. + + Note, if this flag is set, then the callback will always be called + with preemption disabled. If it is not set, then it is possible + (but not guaranteed) that the callback will be called in + preemptible context. FTRACE_OPS_FL_IPMODIFY Requires FTRACE_OPS_FL_SAVE_REGS set. If the callback is to "hijack" diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0e4164a7f56d..806196345c3f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -98,7 +98,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); /* * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are * set in the flags member. - * CONTROL, SAVE_REGS, SAVE_REGS_IF_SUPPORTED, RECURSION_SAFE, STUB and + * CONTROL, SAVE_REGS, SAVE_REGS_IF_SUPPORTED, RECURSION, STUB and * IPMODIFY are a kind of attribute flags which can be set only before * registering the ftrace_ops, and can not be modified while registered. * Changing those attribute flags after registering ftrace_ops will @@ -121,10 +121,10 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * passing regs to the handler. * Note, if this flag is set, the SAVE_REGS flag will automatically * get set upon registering the ftrace_ops, if the arch supports it.
- * RECURSION_SAFE - The ftrace_ops can set this to tell the ftrace infrastructure - * that the call back has its own recursion protection. If it does - * not set this, then the ftrace infrastructure will add recursion - * protection for the caller. + * RECURSION - The ftrace_ops can set this to tell the ftrace infrastructure + * that the call back needs recursion protection. If it does + * not set this, then the ftrace infrastructure will assume + * that the callback can handle recursion on its own. * STUB - The ftrace_ops is just a place holder. * INITIALIZED - The ftrace_ops has already been initialized (first use time * register_ftrace_function() is called, it will initialized the ops) @@ -156,7 +156,7 @@ enum { FTRACE_OPS_FL_DYNAMIC = BIT(1), FTRACE_OPS_FL_SAVE_REGS = BIT(2), FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = BIT(3), - FTRACE_OPS_FL_RECURSION_SAFE = BIT(4), + FTRACE_OPS_FL_RECURSION = BIT(4), FTRACE_OPS_FL_STUB = BIT(5), FTRACE_OPS_FL_INITIALIZED = BIT(6), FTRACE_OPS_FL_DELETED = BIT(7), diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 5658f13037b3..73edb9e4f354 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -334,8 +334,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, static struct ftrace_ops graph_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID | FTRACE_OPS_FL_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8185f7240095..39f2bba89b76 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -80,7 +80,7 @@ enum { struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, + .flags = FTRACE_OPS_FL_STUB, INIT_OPS_HASH(ftrace_list_end) }; @@ -866,7 +866,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + .flags = FTRACE_OPS_FL_INITIALIZED, INIT_OPS_HASH(ftrace_profile_ops) }; @@ -1040,8 +1040,7 @@ struct ftrace_ops global_ops = { .local_hash.notrace_hash = EMPTY_HASH, .local_hash.filter_hash = EMPTY_HASH, INIT_OPS_HASH(global_ops) - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID, }; @@ -2382,7 +2381,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops direct_ops = { .func = call_direct_funcs, - .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, /* @@ -6864,8 +6863,7 @@ void ftrace_init_trace_array(struct trace_array *tr) struct ftrace_ops global_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID, }; @@ -7023,11 +7021,11 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func); ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { /* - * If the function does not handle recursion, needs to be RCU safe, - * or does per cpu logic, then we need to call the assist handler. + * If the function does not handle recursion or needs to be RCU safe, + * then we need to call the assist handler. 
*/ - if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || - ops->flags & FTRACE_OPS_FL_RCU) + if (ops->flags & (FTRACE_OPS_FL_RECURSION | + FTRACE_OPS_FL_RCU)) return ftrace_ops_assist_func; return ops->func; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 47a71f96e5bc..244abbcd1db5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3712,7 +3712,6 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static __init void event_trace_self_test_with_function(void) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 943756c01190..89c414ce1388 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -48,7 +48,7 @@ int ftrace_allocate_ftrace_ops(struct trace_array *tr) /* Currently only the non stack version is supported */ ops->func = function_trace_call; - ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; + ops->flags = FTRACE_OPS_FL_PID; tr->ops = ops; ops->private = tr; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 4738ad48a667..8ee3c0bb5d8a 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -150,17 +150,14 @@ static void trace_selftest_test_dyn_func(unsigned long ip, static struct ftrace_ops test_probe1 = { .func = trace_selftest_test_probe1_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe2 = { .func = trace_selftest_test_probe2_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe3 = { .func = trace_selftest_test_probe3_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static void print_counts(void) @@ -448,11 +445,11 @@ static void trace_selftest_test_recursion_safe_func(unsigned long ip, static struct ftrace_ops test_rec_probe = { .func = trace_selftest_test_recursion_func, + .flags = FTRACE_OPS_FL_RECURSION, }; static struct ftrace_ops test_recsafe_probe = { .func = trace_selftest_test_recursion_safe_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int @@ -561,7 +558,7 @@ static void trace_selftest_test_regs_func(unsigned long ip, static struct ftrace_ops test_regs_probe = { .func = trace_selftest_test_regs_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, + .flags = FTRACE_OPS_FL_SAVE_REGS, }; static int diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c408423e5d65..969db526a563 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -318,7 +318,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static ssize_t -- cgit v1.2.3 From 773c16705058e9be7b0f4ce124e89cd231c120a2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:46 -0500 Subject: ftrace: Add recording of functions that caused recursion This adds CONFIG_FTRACE_RECORD_RECURSION that will record to a file "recursed_functions" all the functions that caused recursion while a callback to the function tracer was running. Link: https://lkml.kernel.org/r/20201106023548.102375687@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Guo Ren Cc: "James E.J. 
Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Thomas Gleixner Cc: Borislav Petkov Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: Kees Cook Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Joe Lawrence Cc: Kamalesh Babulal Cc: Mauro Carvalho Chehab Cc: Sebastian Andrzej Siewior Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-csky@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Cc: live-patching@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 6 +- arch/csky/kernel/probes/ftrace.c | 2 +- arch/parisc/kernel/ftrace.c | 2 +- arch/powerpc/kernel/kprobes-ftrace.c | 2 +- arch/s390/kernel/ftrace.c | 2 +- arch/x86/kernel/kprobes/ftrace.c | 2 +- fs/pstore/ftrace.c | 2 +- include/linux/trace_recursion.h | 29 ++++- kernel/livepatch/patch.c | 2 +- kernel/trace/Kconfig | 25 ++++ kernel/trace/Makefile | 1 + kernel/trace/ftrace.c | 4 +- kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_output.c | 6 +- kernel/trace/trace_output.h | 1 + kernel/trace/trace_recursion_record.c | 236 ++++++++++++++++++++++++++++++++++ 17 files changed, 306 insertions(+), 20 deletions(-) create mode 100644 kernel/trace/trace_recursion_record.c (limited to 'include/linux') diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index 86cd14b8e126..5981d5691745 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -118,7 +118,7 @@ can help in this regard. If you start your code with: int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; @@ -130,7 +130,9 @@ The code in between will be safe to use, even if it ends up calling a function that the callback is tracing. Note, on success, ftrace_test_recursion_trylock() will disable preemption, and the ftrace_test_recursion_unlock() will enable it again (if it was previously -enabled). +enabled). The instruction pointer (ip) and its parent (parent_ip) is passed to +ftrace_test_recursion_trylock() to record where the recursion happened +(if CONFIG_FTRACE_RECORD_RECURSION is set). 
Alternatively, if the FTRACE_OPS_FL_RECURSION flag is set on the ftrace_ops (as explained below), then a helper trampoline will be used to test diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index 5eb2604fdf71..f30b179924ef 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -18,7 +18,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; struct kprobe_ctlblk *kcb; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 13d85042810a..1c5d3732bda2 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -210,7 +210,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 5df8d50c65ae..fdfee39938ea 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -20,7 +20,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct kprobe_ctlblk *kcb; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(nip, parent_nip); if (bit < 0) return; diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 8f31c726537a..657c1ab45408 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -204,7 +204,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index a40a6cdfcca3..954d930a7127 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -20,7 +20,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 816210fc5d3a..adb0935eb062 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -41,7 +41,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, if (unlikely(oops_in_progress)) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index ac3d73484cb2..228cc56ed66e 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -91,6 +91,9 @@ enum { * not be correct. Allow for a single recursion to cover this case. */ TRACE_TRANSITION_BIT, + + /* Used to prevent recursion recording from recursing. */ + TRACE_RECORD_RECURSION_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) @@ -142,7 +145,22 @@ static __always_inline int trace_get_context_bit(void) pc & HARDIRQ_MASK ? 
TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ; } -static __always_inline int trace_test_and_set_recursion(int start, int max) +#ifdef CONFIG_FTRACE_RECORD_RECURSION +extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); +# define do_ftrace_record_recursion(ip, pip) \ + do { \ + if (!trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \ + trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \ + ftrace_record_recursion(ip, pip); \ + trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \ + } \ + } while (0) +#else +# define do_ftrace_record_recursion(ip, pip) do { } while (0) +#endif + +static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, + int start, int max) { unsigned int val = current->trace_recursion; int bit; @@ -158,8 +176,10 @@ static __always_inline int trace_test_and_set_recursion(int start, int max) * a switch between contexts. Allow for a single recursion. */ bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) + if (trace_recursion_test(bit)) { + do_ftrace_record_recursion(ip, pip); return -1; + } trace_recursion_set(bit); barrier(); return bit + 1; @@ -199,9 +219,10 @@ static __always_inline void trace_clear_recursion(int bit) * Returns: -1 if a recursion happened. * >= 0 if no recursion */ -static __always_inline int ftrace_test_recursion_trylock(void) +static __always_inline int ftrace_test_recursion_trylock(unsigned long ip, + unsigned long parent_ip) { - return trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, TRACE_FTRACE_MAX); } /** diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 15480bf3ce88..875c5dbbdd33 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -49,7 +49,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (WARN_ON_ONCE(bit < 0)) return; /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508..9b11c096d139 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -727,6 +727,31 @@ config TRACE_EVAL_MAP_FILE If unsure, say N. +config FTRACE_RECORD_RECURSION + bool "Record functions that recurse in function tracing" + depends on FUNCTION_TRACER + help + All callbacks that attach to the function tracing have some sort + of protection against recursion. Even though the protection exists, + it adds overhead. This option will create a file in the tracefs + file system called "recursed_functions" that will list the functions + that triggered a recursion. + + This will add more overhead to cases that have recursion. + + If unsure, say N + +config FTRACE_RECORD_RECURSION_SIZE + int "Max number of recursed functions to record" + default 128 + depends on FTRACE_RECORD_RECURSION + help + This defines the limit of number of functions that can be + listed in the "recursed_functions" file, that lists all + the functions that caused a recursion to happen. + This file can be reset, but the limit can not change in + size at runtime. 
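As a usage illustration, a minimal userspace sketch that dumps the recorded "parent: function" pairs, assuming tracefs is mounted at /sys/kernel/tracing (opening the file for writing with truncation clears the records):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/kernel/tracing/recursed_functions", "r");

	if (!f) {
		perror("recursed_functions");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}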
+ config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e153be351548..7e44cea89fdc 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o +obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 39f2bba89b76..03aad2b5cd5e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6918,7 +6918,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; @@ -6993,7 +6993,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a2b9fddb8148..1b202e28dfaa 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -447,7 +447,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, if ((unsigned long)ops->private != smp_processor_id()) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 89c414ce1388..646eda6c44a5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -141,7 +141,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (unlikely(!tr->function_enabled)) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 000e9dc224c6..92b1575ae0ca 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -353,8 +353,8 @@ static inline const char *kretprobed(const char *name) } #endif /* CONFIG_KRETPROBES */ -static void -seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) +void +trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; @@ -420,7 +420,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); + trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); if (sym_flags & TRACE_ITER_SYM_ADDR) trace_seq_printf(s, " <" IP_FMT ">", ip); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 2f742b74e7e6..4c954636caf0 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -16,6 +16,7 @@ extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); +extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator 
*iter); diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c new file mode 100644 index 000000000000..b2edac1fe156 --- /dev/null +++ b/kernel/trace/trace_recursion_record.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "trace_output.h" + +struct recursed_functions { + unsigned long ip; + unsigned long parent_ip; +}; + +static struct recursed_functions recursed_functions[CONFIG_FTRACE_RECORD_RECURSION_SIZE]; +static atomic_t nr_records; + +/* + * Cache the last found function. Yes, updates to this is racey, but + * so is memory cache ;-) + */ +static unsigned long cached_function; + +void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip) +{ + int index = 0; + int i; + unsigned long old; + + again: + /* First check the last one recorded */ + if (ip == cached_function) + return; + + i = atomic_read(&nr_records); + /* nr_records is -1 when clearing records */ + smp_mb__after_atomic(); + if (i < 0) + return; + + /* + * If there's two writers and this writer comes in second, + * the cmpxchg() below to update the ip will fail. Then this + * writer will try again. It is possible that index will now + * be greater than nr_records. This is because the writer + * that succeeded has not updated the nr_records yet. + * This writer could keep trying again until the other writer + * updates nr_records. But if the other writer takes an + * interrupt, and that interrupt locks up that CPU, we do + * not want this CPU to lock up due to the recursion protection, + * and have a bug report showing this CPU as the cause of + * locking up the computer. To not lose this record, this + * writer will simply use the next position to update the + * recursed_functions, and it will update the nr_records + * accordingly. + */ + if (index < i) + index = i; + if (index >= CONFIG_FTRACE_RECORD_RECURSION_SIZE) + return; + + for (i = index - 1; i >= 0; i--) { + if (recursed_functions[i].ip == ip) { + cached_function = ip; + return; + } + } + + cached_function = ip; + + /* + * We only want to add a function if it hasn't been added before. + * Add to the current location before incrementing the count. + * If it fails to add, then increment the index (save in i) + * and try again. + */ + old = cmpxchg(&recursed_functions[index].ip, 0, ip); + if (old != 0) { + /* Did something else already added this for us? */ + if (old == ip) + return; + /* Try the next location (use i for the next index) */ + index++; + goto again; + } + + recursed_functions[index].parent_ip = parent_ip; + + /* + * It's still possible that we could race with the clearing + * CPU0 CPU1 + * ---- ---- + * ip = func + * nr_records = -1; + * recursed_functions[0] = 0; + * i = -1 + * if (i < 0) + * nr_records = 0; + * (new recursion detected) + * recursed_functions[0] = func + * cmpxchg(recursed_functions[0], + * func, 0) + * + * But the worse that could happen is that we get a zero in + * the recursed_functions array, and it's likely that "func" will + * be recorded again. 
+ */ + i = atomic_read(&nr_records); + smp_mb__after_atomic(); + if (i < 0) + cmpxchg(&recursed_functions[index].ip, ip, 0); + else if (i <= index) + atomic_cmpxchg(&nr_records, i, index + 1); +} +EXPORT_SYMBOL_GPL(ftrace_record_recursion); + +static DEFINE_MUTEX(recursed_function_lock); +static struct trace_seq *tseq; + +static void *recursed_function_seq_start(struct seq_file *m, loff_t *pos) +{ + void *ret = NULL; + int index; + + mutex_lock(&recursed_function_lock); + index = atomic_read(&nr_records); + if (*pos < index) { + ret = &recursed_functions[*pos]; + } + + tseq = kzalloc(sizeof(*tseq), GFP_KERNEL); + if (!tseq) + return ERR_PTR(-ENOMEM); + + trace_seq_init(tseq); + + return ret; +} + +static void *recursed_function_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int index; + int p; + + index = atomic_read(&nr_records); + p = ++(*pos); + + return p < index ? &recursed_functions[p] : NULL; +} + +static void recursed_function_seq_stop(struct seq_file *m, void *v) +{ + kfree(tseq); + mutex_unlock(&recursed_function_lock); +} + +static int recursed_function_seq_show(struct seq_file *m, void *v) +{ + struct recursed_functions *record = v; + int ret = 0; + + if (record) { + trace_seq_print_sym(tseq, record->parent_ip, true); + trace_seq_puts(tseq, ":\t"); + trace_seq_print_sym(tseq, record->ip, true); + trace_seq_putc(tseq, '\n'); + ret = trace_print_seq(m, tseq); + } + + return ret; +} + +static const struct seq_operations recursed_function_seq_ops = { + .start = recursed_function_seq_start, + .next = recursed_function_seq_next, + .stop = recursed_function_seq_stop, + .show = recursed_function_seq_show +}; + +static int recursed_function_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + mutex_lock(&recursed_function_lock); + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + /* disable updating records */ + atomic_set(&nr_records, -1); + smp_mb__after_atomic(); + memset(recursed_functions, 0, sizeof(recursed_functions)); + smp_wmb(); + /* enable them again */ + atomic_set(&nr_records, 0); + } + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &recursed_function_seq_ops); + mutex_unlock(&recursed_function_lock); + + return ret; +} + +static ssize_t recursed_function_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static int recursed_function_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + +static const struct file_operations recursed_functions_fops = { + .open = recursed_function_open, + .write = recursed_function_write, + .read = seq_read, + .llseek = seq_lseek, + .release = recursed_function_release, +}; + +__init static int create_recursed_functions(void) +{ + struct dentry *dentry; + + dentry = trace_create_file("recursed_functions", 0644, NULL, NULL, + &recursed_functions_fops); + if (!dentry) + pr_warn("WARNING: Failed to create recursed_functions\n"); + return 0; +} + +fs_initcall(create_recursed_functions); -- cgit v1.2.3 From 22447a99c97e353bde8f90c2353873f27681d57c Mon Sep 17 00:00:00 2001 From: Pawel Czarnecki Date: Tue, 13 Oct 2020 16:45:52 +0200 Subject: drivers/soc/litex: add LiteX SoC Controller driver This commit adds driver for the FPGA-based LiteX SoC Controller from LiteX SoC builder. 
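A minimal sketch of how a hypothetical LiteX peripheral driver could use the common accessors introduced below; the CSR offset and width are illustrative assumptions, and regs would come from the usual ioremap of the peripheral's register resource:

#include <linux/litex.h>

#define EXAMPLE_CSR_OFF		0x00	/* hypothetical CSR offset */
#define EXAMPLE_CSR_WIDTH	4	/* logical CSR width in bytes */

static unsigned long example_csr_read(void __iomem *regs)
{
	return litex_get_reg(regs + EXAMPLE_CSR_OFF, EXAMPLE_CSR_WIDTH);
}

static void example_csr_write(void __iomem *regs, unsigned long val)
{
	litex_set_reg(regs + EXAMPLE_CSR_OFF, EXAMPLE_CSR_WIDTH, val);
}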
Co-developed-by: Mateusz Holenko Signed-off-by: Mateusz Holenko Signed-off-by: Pawel Czarnecki Signed-off-by: Stafford Horne --- MAINTAINERS | 2 + drivers/soc/Kconfig | 1 + drivers/soc/Makefile | 1 + drivers/soc/litex/Kconfig | 19 ++++ drivers/soc/litex/Makefile | 3 + drivers/soc/litex/litex_soc_ctrl.c | 176 +++++++++++++++++++++++++++++++++++++ include/linux/litex.h | 102 +++++++++++++++++++++ 7 files changed, 304 insertions(+) create mode 100644 drivers/soc/litex/Kconfig create mode 100644 drivers/soc/litex/Makefile create mode 100644 drivers/soc/litex/litex_soc_ctrl.c create mode 100644 include/linux/litex.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 049af639fdfc..3fde022413f9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10177,6 +10177,8 @@ M: Karol Gugala M: Mateusz Holenko S: Maintained F: Documentation/devicetree/bindings/*/litex,*.yaml +F: drivers/soc/litex/litex_soc_ctrl.c +F: include/linux/litex.h LIVE PATCHING M: Josh Poimboeuf diff --git a/drivers/soc/Kconfig b/drivers/soc/Kconfig index 425ab6f7e375..d097d070f579 100644 --- a/drivers/soc/Kconfig +++ b/drivers/soc/Kconfig @@ -9,6 +9,7 @@ source "drivers/soc/bcm/Kconfig" source "drivers/soc/fsl/Kconfig" source "drivers/soc/imx/Kconfig" source "drivers/soc/ixp4xx/Kconfig" +source "drivers/soc/litex/Kconfig" source "drivers/soc/mediatek/Kconfig" source "drivers/soc/qcom/Kconfig" source "drivers/soc/renesas/Kconfig" diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile index 36452bed86ef..0b16108823ef 100644 --- a/drivers/soc/Makefile +++ b/drivers/soc/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_ARCH_GEMINI) += gemini/ obj-y += imx/ obj-$(CONFIG_ARCH_IXP4XX) += ixp4xx/ obj-$(CONFIG_SOC_XWAY) += lantiq/ +obj-$(CONFIG_LITEX_SOC_CONTROLLER) += litex/ obj-y += mediatek/ obj-y += amlogic/ obj-y += qcom/ diff --git a/drivers/soc/litex/Kconfig b/drivers/soc/litex/Kconfig new file mode 100644 index 000000000000..7c6b009b6f6c --- /dev/null +++ b/drivers/soc/litex/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 + +menu "Enable LiteX SoC Builder specific drivers" + +config LITEX + bool + +config LITEX_SOC_CONTROLLER + tristate "Enable LiteX SoC Controller driver" + depends on OF || COMPILE_TEST + select LITEX + help + This option enables the SoC Controller Driver which verifies + LiteX CSR access and provides common litex_get_reg/litex_set_reg + accessors. + All drivers that use functions from litex.h must depend on + LITEX. + +endmenu diff --git a/drivers/soc/litex/Makefile b/drivers/soc/litex/Makefile new file mode 100644 index 000000000000..98ff7325b1c0 --- /dev/null +++ b/drivers/soc/litex/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_LITEX_SOC_CONTROLLER) += litex_soc_ctrl.o diff --git a/drivers/soc/litex/litex_soc_ctrl.c b/drivers/soc/litex/litex_soc_ctrl.c new file mode 100644 index 000000000000..1217cafdfd4d --- /dev/null +++ b/drivers/soc/litex/litex_soc_ctrl.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * LiteX SoC Controller Driver + * + * Copyright (C) 2020 Antmicro + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The LiteX SoC Generator, depending on the configuration, can split a single + * logical CSR (Control&Status Register) into a series of consecutive physical + * registers.
+ * + * For example, in the configuration with 8-bit CSR Bus, 32-bit aligned (the + * default one for 32-bit CPUs) a 32-bit logical CSR will be generated as four + * 32-bit physical registers, each one containing one byte of meaningful data. + * + * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus + * + * The purpose of `litex_set_reg`/`litex_get_reg` is to implement the logic + * of writing to/reading from the LiteX CSR in a single place that can be + * then reused by all LiteX drivers. + */ + +/** + * litex_set_reg() - Writes the value to the LiteX CSR (Control&Status Register) + * @reg: Address of the CSR + * @reg_size: The width of the CSR expressed in the number of bytes + * @val: Value to be written to the CSR + * + * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned), + * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers, + * each one containing one byte of meaningful data. + * + * This function splits a single possibly multi-byte write into a series of + * single-byte writes with a proper offset. + */ +void litex_set_reg(void __iomem *reg, unsigned long reg_size, + unsigned long val) +{ + unsigned long shifted_data, shift, i; + + for (i = 0; i < reg_size; ++i) { + shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT); + shifted_data = val >> shift; + + WRITE_LITEX_SUBREGISTER(shifted_data, reg, i); + } +} +EXPORT_SYMBOL_GPL(litex_set_reg); + +/** + * litex_get_reg() - Reads the value of the LiteX CSR (Control&Status Register) + * @reg: Address of the CSR + * @reg_size: The width of the CSR expressed in the number of bytes + * + * Return: Value read from the CSR + * + * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned), + * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers, + * each one containing one byte of meaningful data. + * + * This function generates a series of single-byte reads with a proper offset + * and joins their results into a single multi-byte value. + */ +unsigned long litex_get_reg(void __iomem *reg, unsigned long reg_size) +{ + unsigned long shifted_data, shift, i; + unsigned long result = 0; + + for (i = 0; i < reg_size; ++i) { + shifted_data = READ_LITEX_SUBREGISTER(reg, i); + + shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT); + result |= (shifted_data << shift); + } + + return result; +} +EXPORT_SYMBOL_GPL(litex_get_reg); + +#define SCRATCH_REG_OFF 0x04 +#define SCRATCH_REG_VALUE 0x12345678 +#define SCRATCH_TEST_VALUE 0xdeadbeef + +/* + * Check LiteX CSR read/write access + * + * This function reads and writes a scratch register in order to verify if CSR + * access works. + * + * In case any problems are detected, the driver should panic. + * + * Access to the LiteX CSR is, by design, done in CPU native endianness. + * The driver should not dynamically configure access functions when + * the endianness mismatch is detected. Such situation indicates problems in + * the soft SoC design and should be solved at the LiteX generator level, + * not in the software. + */ +static int litex_check_csr_access(void __iomem *reg_addr) +{ + unsigned long reg; + + reg = litex_read32(reg_addr + SCRATCH_REG_OFF); + + if (reg != SCRATCH_REG_VALUE) { + panic("Scratch register read error - the system is probably broken! 
Expected: 0x%x but got: 0x%lx", + SCRATCH_REG_VALUE, reg); + return -EINVAL; + } + + litex_write32(reg_addr + SCRATCH_REG_OFF, SCRATCH_TEST_VALUE); + reg = litex_read32(reg_addr + SCRATCH_REG_OFF); + + if (reg != SCRATCH_TEST_VALUE) { + panic("Scratch register write error - the system is probably broken! Expected: 0x%x but got: 0x%lx", + SCRATCH_TEST_VALUE, reg); + return -EINVAL; + } + + /* restore original value of the SCRATCH register */ + litex_write32(reg_addr + SCRATCH_REG_OFF, SCRATCH_REG_VALUE); + + pr_info("LiteX SoC Controller driver initialized"); + + return 0; +} + +struct litex_soc_ctrl_device { + void __iomem *base; +}; + +static const struct of_device_id litex_soc_ctrl_of_match[] = { + {.compatible = "litex,soc-controller"}, + {}, +}; + +MODULE_DEVICE_TABLE(of, litex_soc_ctrl_of_match); + +static int litex_soc_ctrl_probe(struct platform_device *pdev) +{ + struct litex_soc_ctrl_device *soc_ctrl_dev; + + soc_ctrl_dev = devm_kzalloc(&pdev->dev, sizeof(*soc_ctrl_dev), GFP_KERNEL); + if (!soc_ctrl_dev) + return -ENOMEM; + + soc_ctrl_dev->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(soc_ctrl_dev->base)) + return PTR_ERR(soc_ctrl_dev->base); + + return litex_check_csr_access(soc_ctrl_dev->base); +} + +static struct platform_driver litex_soc_ctrl_driver = { + .driver = { + .name = "litex-soc-controller", + .of_match_table = of_match_ptr(litex_soc_ctrl_of_match) + }, + .probe = litex_soc_ctrl_probe, +}; + +module_platform_driver(litex_soc_ctrl_driver); +MODULE_DESCRIPTION("LiteX SoC Controller driver"); +MODULE_AUTHOR("Antmicro "); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/litex.h b/include/linux/litex.h new file mode 100644 index 000000000000..40f5be503593 --- /dev/null +++ b/include/linux/litex.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common LiteX header providing + * helper functions for accessing CSRs. + * + * Implementation of the functions is provided by + * the LiteX SoC Controller driver. + * + * Copyright (C) 2019-2020 Antmicro + */ + +#ifndef _LINUX_LITEX_H +#define _LINUX_LITEX_H + +#include +#include +#include + +/* + * The parameters below are true for LiteX SoCs configured for 8-bit CSR Bus, + * 32-bit aligned. + * + * Supporting other configurations will require extending the logic in this + * header and in the LiteX SoC controller driver. 
+ */ +#define LITEX_REG_SIZE 0x4 +#define LITEX_SUBREG_SIZE 0x1 +#define LITEX_SUBREG_SIZE_BIT (LITEX_SUBREG_SIZE * 8) + +#define WRITE_LITEX_SUBREGISTER(val, base_offset, subreg_id) \ + writel((u32 __force)cpu_to_le32(val), base_offset + (LITEX_REG_SIZE * subreg_id)) + +#define READ_LITEX_SUBREGISTER(base_offset, subreg_id) \ + le32_to_cpu((__le32 __force)readl(base_offset + (LITEX_REG_SIZE * subreg_id))) + +void litex_set_reg(void __iomem *reg, unsigned long reg_sz, unsigned long val); + +unsigned long litex_get_reg(void __iomem *reg, unsigned long reg_sz); + +static inline void litex_write8(void __iomem *reg, u8 val) +{ + WRITE_LITEX_SUBREGISTER(val, reg, 0); +} + +static inline void litex_write16(void __iomem *reg, u16 val) +{ + WRITE_LITEX_SUBREGISTER(val >> 8, reg, 0); + WRITE_LITEX_SUBREGISTER(val, reg, 1); +} + +static inline void litex_write32(void __iomem *reg, u32 val) +{ + WRITE_LITEX_SUBREGISTER(val >> 24, reg, 0); + WRITE_LITEX_SUBREGISTER(val >> 16, reg, 1); + WRITE_LITEX_SUBREGISTER(val >> 8, reg, 2); + WRITE_LITEX_SUBREGISTER(val, reg, 3); +} + +static inline void litex_write64(void __iomem *reg, u64 val) +{ + WRITE_LITEX_SUBREGISTER(val >> 56, reg, 0); + WRITE_LITEX_SUBREGISTER(val >> 48, reg, 1); + WRITE_LITEX_SUBREGISTER(val >> 40, reg, 2); + WRITE_LITEX_SUBREGISTER(val >> 32, reg, 3); + WRITE_LITEX_SUBREGISTER(val >> 24, reg, 4); + WRITE_LITEX_SUBREGISTER(val >> 16, reg, 5); + WRITE_LITEX_SUBREGISTER(val >> 8, reg, 6); + WRITE_LITEX_SUBREGISTER(val, reg, 7); +} + +static inline u8 litex_read8(void __iomem *reg) +{ + return READ_LITEX_SUBREGISTER(reg, 0); +} + +static inline u16 litex_read16(void __iomem *reg) +{ + return (READ_LITEX_SUBREGISTER(reg, 0) << 8) + | (READ_LITEX_SUBREGISTER(reg, 1)); +} + +static inline u32 litex_read32(void __iomem *reg) +{ + return (READ_LITEX_SUBREGISTER(reg, 0) << 24) + | (READ_LITEX_SUBREGISTER(reg, 1) << 16) + | (READ_LITEX_SUBREGISTER(reg, 2) << 8) + | (READ_LITEX_SUBREGISTER(reg, 3)); +} + +static inline u64 litex_read64(void __iomem *reg) +{ + return ((u64)READ_LITEX_SUBREGISTER(reg, 0) << 56) + | ((u64)READ_LITEX_SUBREGISTER(reg, 1) << 48) + | ((u64)READ_LITEX_SUBREGISTER(reg, 2) << 40) + | ((u64)READ_LITEX_SUBREGISTER(reg, 3) << 32) + | ((u64)READ_LITEX_SUBREGISTER(reg, 4) << 24) + | ((u64)READ_LITEX_SUBREGISTER(reg, 5) << 16) + | ((u64)READ_LITEX_SUBREGISTER(reg, 6) << 8) + | ((u64)READ_LITEX_SUBREGISTER(reg, 7)); +} + +#endif /* _LINUX_LITEX_H */ -- cgit v1.2.3 From 60602cb549f1965a7edbc96026760dfb93911fab Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 08:19:24 -0400 Subject: fgraph: Make overruns 4 bytes in graph stack structure Inspecting the data structures of the function graph tracer, I found that the overrun value is an unsigned long, which is 8 bytes on a 64-bit machine, and not only that, the depth is an int (4 bytes). The overrun can simply be an unsigned int (4 bytes), which packs the ftrace_graph_ret structure better. The depth is moved up next to the func, as it is used more often with func, which improves cache locality.
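For illustration, on a typical 64-bit build the __packed layout shifts roughly as follows (the sizes shown are the usual ones for such a configuration, a sketch rather than compiler output):

	/* before: 8 + 8 + 8 + 8 + 4 = 36 bytes */
	unsigned long func; unsigned long overrun;
	unsigned long long calltime; unsigned long long rettime; int depth;

	/* after: 8 + 4 + 4 + 8 + 8 = 32 bytes */
	unsigned long func; int depth; unsigned int overrun;
	unsigned long long calltime; unsigned long long rettime;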
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 ++-- kernel/trace/trace_entries.h | 4 ++-- kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 806196345c3f..8dde9c17aaa5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -864,11 +864,11 @@ struct ftrace_graph_ent { */ struct ftrace_graph_ret { unsigned long func; /* Current function */ + int depth; /* Number of functions that overran the depth limit for current task */ - unsigned long overrun; + unsigned int overrun; unsigned long long calltime; unsigned long long rettime; - int depth; } __packed; /* Type of the callback handlers for tracing function graph*/ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 18c4a58aff79..ceafe2dc97e1 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -93,10 +93,10 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) - __field_packed( unsigned long, ret, overrun ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) __field_packed( unsigned long long, ret, calltime) __field_packed( unsigned long long, ret, rettime ) - __field_packed( int, ret, depth ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 60d66278aa0d..d874dec87131 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -957,7 +957,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) - trace_seq_printf(s, " (Overruns: %lu)\n", + trace_seq_printf(s, " (Overruns: %u)\n", trace->overrun); print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, -- cgit v1.2.3 From 7b68621f8d16689cbb4203aceaca86ffb165f1d0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 30 Oct 2020 17:21:00 -0400 Subject: ftrace: Clean up the recursion code a bit In trace_test_and_set_recursion(), current->trace_recursion is placed into a variable, and that variable should be used for the processing, as there's no reason to dereference current multiple times. On trace_clear_recursion(), current->trace_recursion is modified and there's no reason to copy it over to a variable. Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index 228cc56ed66e..a9f9c5714e65 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -162,7 +162,7 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, int start, int max) { - unsigned int val = current->trace_recursion; + unsigned int val = READ_ONCE(current->trace_recursion); int bit; /* A previous recursion check was made */ @@ -176,18 +176,15 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign * a switch between contexts. Allow for a single recursion. 
*/ bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) { + if (val & (1 << bit)) { do_ftrace_record_recursion(ip, pip); return -1; } - trace_recursion_set(bit); - barrier(); - return bit + 1; + } else { + /* Normal check passed, clear the transition to allow it again */ + val &= ~(1 << TRACE_TRANSITION_BIT); } - /* Normal check passed, clear the transition to allow it again */ - trace_recursion_clear(TRACE_TRANSITION_BIT); - val |= 1 << bit; current->trace_recursion = val; barrier(); @@ -197,17 +194,12 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign static __always_inline void trace_clear_recursion(int bit) { - unsigned int val = current->trace_recursion; - if (!bit) return; - bit--; - bit = 1 << bit; - val &= ~bit; - barrier(); - current->trace_recursion = val; + bit--; + trace_recursion_clear(bit); } /** -- cgit v1.2.3 From cd2c40ff90b0e385c18f881ab5e17f7137864223 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Thu, 29 Oct 2020 15:27:36 -0700 Subject: platform/chrome: cros_ec: Import Type C host commands Import the EC_CMD_TYPEC_STATUS and EC_CMD_TYPEC_DISCOVERY Chrome OS EC host commands from the EC code base [1]. These commands can be used by the application processor to query Power Delivery (PD) discovery information concerning connected Type C peripherals. Also add the EC_FEATURE_TYPEC_CMD feature flag, which is used to determine whether these commands are supported by the EC. [1]: https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/master/include/ec_commands.h Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20201029222738.482366-5-pmalani@chromium.org --- include/linux/platform_data/cros_ec_commands.h | 155 +++++++++++++++++++++++++ 1 file changed, 155 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index 1fcfe9e63cb9..7f54fdcdd8cb 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -1284,6 +1284,8 @@ enum ec_feature_code { EC_FEATURE_SCP = 39, /* The MCU is an Integrated Sensor Hub */ EC_FEATURE_ISH = 40, + /* New TCPMv2 TYPEC_ prefaced commands supported */ + EC_FEATURE_TYPEC_CMD = 41, }; #define EC_FEATURE_MASK_0(event_code) BIT(event_code % 32) @@ -5528,6 +5530,159 @@ struct ec_response_regulator_get_voltage { uint32_t voltage_mv; } __ec_align4; +/* + * Gather all discovery information for the given port and partner type. + * + * Note that if discovery has not yet completed, only the currently completed + * responses will be filled in. If the discovery data structures are changed + * in the process of the command running, BUSY will be returned. + * + * VDO field sizes are set to the maximum possible number of VDOs a VDM may + * contain, while the number of SVIDs here is selected to fit within the PROTO2 + * maximum parameter size. 
+ */ +#define EC_CMD_TYPEC_DISCOVERY 0x0131 + +enum typec_partner_type { + TYPEC_PARTNER_SOP = 0, + TYPEC_PARTNER_SOP_PRIME = 1, +}; + +struct ec_params_typec_discovery { + uint8_t port; + uint8_t partner_type; /* enum typec_partner_type */ +} __ec_align1; + +struct svid_mode_info { + uint16_t svid; + uint16_t mode_count; /* Number of modes partner sent */ + uint32_t mode_vdo[6]; /* Max VDOs allowed after VDM header is 6 */ +}; + +struct ec_response_typec_discovery { + uint8_t identity_count; /* Number of identity VDOs partner sent */ + uint8_t svid_count; /* Number of SVIDs partner sent */ + uint16_t reserved; + uint32_t discovery_vdo[6]; /* Max VDOs allowed after VDM header is 6 */ + struct svid_mode_info svids[0]; +} __ec_align1; + +/* + * Gather all status information for a port. + * + * Note: this covers many of the return fields from the deprecated + * EC_CMD_USB_PD_CONTROL command, except those that are redundant with the + * discovery data. The "enum pd_cc_states" is defined with the deprecated + * EC_CMD_USB_PD_CONTROL command. + * + * This also combines in the EC_CMD_USB_PD_MUX_INFO flags. + */ +#define EC_CMD_TYPEC_STATUS 0x0133 + +/* + * Power role. + * + * Note this is also used for PD header creation, and values align to those in + * the Power Delivery Specification Revision 3.0 (See + * 6.2.1.1.4 Port Power Role). + */ +enum pd_power_role { + PD_ROLE_SINK = 0, + PD_ROLE_SOURCE = 1 +}; + +/* + * Data role. + * + * Note this is also used for PD header creation, and the first two values + * align to those in the Power Delivery Specification Revision 3.0 (See + * 6.2.1.1.6 Port Data Role). + */ +enum pd_data_role { + PD_ROLE_UFP = 0, + PD_ROLE_DFP = 1, + PD_ROLE_DISCONNECTED = 2, +}; + +enum pd_vconn_role { + PD_ROLE_VCONN_OFF = 0, + PD_ROLE_VCONN_SRC = 1, +}; + +/* + * Note: BIT(0) may be used to determine whether the polarity is CC1 or CC2, + * regardless of whether a debug accessory is connected. + */ +enum tcpc_cc_polarity { + /* + * _CCx: is used to indicate the polarity while not connected to + * a Debug Accessory. Only one CC line will assert a resistor and + * the other will be open. + */ + POLARITY_CC1 = 0, + POLARITY_CC2 = 1, + + /* + * _CCx_DTS is used to indicate the polarity while connected to a + * SRC Debug Accessory. Assert resistors on both lines. + */ + POLARITY_CC1_DTS = 2, + POLARITY_CC2_DTS = 3, + + /* + * The current TCPC code relies on these specific POLARITY values. + * A check is added so that, if the list grows for any reason, it + * will give a hint that other places need to be + * adjusted.
+ */ + POLARITY_COUNT +}; + +#define PD_STATUS_EVENT_SOP_DISC_DONE BIT(0) +#define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE BIT(1) + +struct ec_params_typec_status { + uint8_t port; +} __ec_align1; + +struct ec_response_typec_status { + uint8_t pd_enabled; /* PD communication enabled - bool */ + uint8_t dev_connected; /* Device connected - bool */ + uint8_t sop_connected; /* Device is SOP PD capable - bool */ + uint8_t source_cap_count; /* Number of Source Cap PDOs */ + + uint8_t power_role; /* enum pd_power_role */ + uint8_t data_role; /* enum pd_data_role */ + uint8_t vconn_role; /* enum pd_vconn_role */ + uint8_t sink_cap_count; /* Number of Sink Cap PDOs */ + + uint8_t polarity; /* enum tcpc_cc_polarity */ + uint8_t cc_state; /* enum pd_cc_states */ + uint8_t dp_pin; /* DP pin mode (MODE_DP_IN_[A-E]) */ + uint8_t mux_state; /* USB_PD_MUX* - encoded mux state */ + + char tc_state[32]; /* TC state name */ + + uint32_t events; /* PD_STATUS_EVENT bitmask */ + + /* + * BCD PD revisions for partners + * + * The format has the PD major revision in the upper nibble, and the PD + * minor version in the next nibble. The following two nibbles are + * currently 0. + * ex. PD 3.2 would map to 0x3200 + * + * PD major/minor will be 0 if no PD device is connected. + */ + uint16_t sop_revision; + uint16_t sop_prime_revision; + + uint32_t source_cap_pdos[7]; /* Max 7 PDOs can be present */ + + uint32_t sink_cap_pdos[7]; /* Max 7 PDOs can be present */ +} __ec_align1; + /*****************************************************************************/ /* The command range 0x200-0x2FF is reserved for Rotor. */ -- cgit v1.2.3 From d19ad0775dcd64b49eecf4fa79c17959ebfbd26b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 17:42:17 -0400 Subject: ftrace: Have the callbacks receive a struct ftrace_regs instead of pt_regs In preparation for having the arguments of a function passed to callbacks attached to functions by default, change the default callback prototype to receive a struct ftrace_regs as the fourth parameter instead of a pt_regs. Callbacks that set the FL_SAVE_REGS flag in their ftrace_ops flags will now need to get the pt_regs via a ftrace_get_regs() helper call. If that helper is called from a callback whose ftrace_ops did not have the FL_SAVE_REGS flag set, it will return NULL. This will allow the ftrace_regs to hold just enough to get the parameters and stack pointer, but without the worry that callbacks may have a pt_regs that is not completely filled.
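As a rough sketch of what a converted callback looks like under the new prototype (the callback below is illustrative, not taken from this patch):

	static void my_callback(unsigned long ip, unsigned long parent_ip,
				struct ftrace_ops *op, struct ftrace_regs *fregs)
	{
		/* NULL unless this ftrace_ops set FTRACE_OPS_FL_SAVE_REGS */
		struct pt_regs *regs = ftrace_get_regs(fregs);

		if (regs) {
			/* full register state is available here */
		}
	}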
Acked-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/csky/kernel/probes/ftrace.c | 4 +++- arch/nds32/kernel/ftrace.c | 4 ++-- arch/parisc/kernel/ftrace.c | 8 +++++--- arch/powerpc/kernel/kprobes-ftrace.c | 4 +++- arch/s390/kernel/ftrace.c | 4 +++- arch/x86/kernel/kprobes/ftrace.c | 3 ++- fs/pstore/ftrace.c | 2 +- include/linux/ftrace.h | 16 ++++++++++++++-- include/linux/kprobes.h | 2 +- kernel/livepatch/patch.c | 3 ++- kernel/trace/ftrace.c | 27 +++++++++++++++------------ kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_functions.c | 9 ++++----- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 20 +++++++++++--------- kernel/trace/trace_stack.c | 2 +- 18 files changed, 71 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index f30b179924ef..ae2b1c7b3b5c 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -11,17 +11,19 @@ int arch_check_ftrace_location(struct kprobe *p) /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { int bit; bool lr_saver = false; struct kprobe *p; struct kprobe_ctlblk *kcb; + struct pt_regs *regs; bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (!p) { diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c index 3763b3f8c3db..414f8a780cc3 100644 --- a/arch/nds32/kernel/ftrace.c +++ b/arch/nds32/kernel/ftrace.c @@ -10,7 +10,7 @@ extern void (*ftrace_trace_function)(unsigned long, unsigned long, extern void ftrace_graph_caller(void); noinline void __naked ftrace_stub(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { __asm__ (""); /* avoid to optimize as pure function */ } @@ -38,7 +38,7 @@ EXPORT_SYMBOL(_mcount); #else /* CONFIG_DYNAMIC_FTRACE */ noinline void __naked ftrace_stub(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { __asm__ (""); /* avoid to optimize as pure function */ } diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 1c5d3732bda2..0a1e75af5382 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -51,7 +51,7 @@ static void __hot prepare_ftrace_return(unsigned long *parent, void notrace __hot ftrace_function_trampoline(unsigned long parent, unsigned long self_addr, unsigned long org_sp_gr3, - struct pt_regs *regs) + struct ftrace_regs *fregs) { #ifndef CONFIG_DYNAMIC_FTRACE extern ftrace_func_t ftrace_trace_function; @@ -61,7 +61,7 @@ void notrace __hot ftrace_function_trampoline(unsigned long parent, if (function_trace_op->flags & FTRACE_OPS_FL_ENABLED && ftrace_trace_function != ftrace_stub) ftrace_trace_function(self_addr, parent, - function_trace_op, regs); + function_trace_op, fregs); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (dereference_function_descriptor(ftrace_graph_return) != @@ -204,9 +204,10 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, #ifdef CONFIG_KPROBES_ON_FTRACE void 
kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe_ctlblk *kcb; + struct pt_regs *regs; struct kprobe *p; int bit; @@ -214,6 +215,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index fdfee39938ea..660138f6c4b2 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -14,16 +14,18 @@ /* Ftrace callback handler for kprobes */ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe *p; struct kprobe_ctlblk *kcb; + struct pt_regs *regs; int bit; bit = ftrace_test_recursion_trylock(nip, parent_nip); if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 657c1ab45408..67b80f4412f9 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -198,9 +198,10 @@ int ftrace_disable_ftrace_graph_caller(void) #ifdef CONFIG_KPROBES_ON_FTRACE void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe_ctlblk *kcb; + struct pt_regs *regs; struct kprobe *p; int bit; @@ -208,6 +209,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 954d930a7127..373e5fa3ce1f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -14,8 +14,9 @@ /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct kprobe *p; struct kprobe_ctlblk *kcb; int bit; diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index adb0935eb062..5939595f0115 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -26,7 +26,7 @@ static u64 pstore_ftrace_stamp; static void notrace pstore_ftrace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, - struct pt_regs *regs) + struct ftrace_regs *fregs) { int bit; unsigned long flags; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8dde9c17aaa5..24e1fa52337d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -90,8 +90,20 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, struct ftrace_ops; +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) +{ + if (!fregs) + return NULL; + + return &fregs->regs; +} + typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops 
*op, struct ftrace_regs *fregs); ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); @@ -259,7 +271,7 @@ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); extern void ftrace_stub(unsigned long a0, unsigned long a1, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); #else /* !CONFIG_FUNCTION_TRACER */ /* diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 629abaf25681..be73350955e4 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -345,7 +345,7 @@ static inline void wait_for_kprobe_optimizer(void) { } #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs); + struct ftrace_ops *ops, struct ftrace_regs *fregs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); #endif diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 875c5dbbdd33..f89f9e7e9b07 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -40,8 +40,9 @@ struct klp_ops *klp_find_ops(void *old_func) static void notrace klp_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *fops, - struct pt_regs *regs) + struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct klp_ops *ops; struct klp_func *func; int patch_state; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3db64fb0cce8..67888311784e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -121,7 +121,7 @@ struct ftrace_ops global_ops; #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); #else /* See comment below, where ftrace_ops_list_func is defined */ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); @@ -140,7 +140,7 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; int pid; @@ -154,7 +154,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, return; } - op->saved_func(ip, parent_ip, op, regs); + op->saved_func(ip, parent_ip, op, fregs); } static void ftrace_sync_ipi(void *data) @@ -754,7 +754,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) static void function_profile_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -2143,6 +2143,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) else rec->flags &= ~FTRACE_FL_TRAMP_EN; } + if (flag & FTRACE_FL_DIRECT) { /* * If there's only one user (direct_ops helper) @@ -2368,8 +2369,9 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) } static void call_direct_funcs(unsigned long ip, unsigned long pip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); unsigned long addr; addr = ftrace_find_rec_direct(ip); @@ -4292,7 +4294,7 @@ static int __init ftrace_mod_cmd_init(void) 
core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct ftrace_probe_ops *probe_ops; struct ftrace_func_probe *probe; @@ -6911,8 +6913,9 @@ void ftrace_reset_array_ops(struct trace_array *tr) static nokprobe_inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ignored, struct pt_regs *regs) + struct ftrace_ops *ignored, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct ftrace_ops *op; int bit; @@ -6945,7 +6948,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, pr_warn("op=%p %pS\n", op, op); goto out; } - op->func(ip, parent_ip, op, regs); + op->func(ip, parent_ip, op, fregs); } } while_for_each_ftrace_op(op); out: @@ -6968,9 +6971,9 @@ out: */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { - __ftrace_ops_list_func(ip, parent_ip, NULL, regs); + __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); } NOKPROBE_SYMBOL(ftrace_ops_list_func); #else @@ -6987,7 +6990,7 @@ NOKPROBE_SYMBOL(ftrace_ops_no_ops); * this function will be called by the mcount trampoline. */ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { int bit; @@ -6998,7 +7001,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) - op->func(ip, parent_ip, op, regs); + op->func(ip, parent_ip, op, fregs); preempt_enable_notrace(); trace_clear_recursion(bit); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 1b202e28dfaa..a71181655958 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -432,7 +432,7 @@ NOKPROBE_SYMBOL(perf_trace_buf_update); #ifdef CONFIG_FUNCTION_TRACER static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *pt_regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_entry *entry; struct perf_event *event; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f4b459bb6d33..98d194d8460e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3673,7 +3673,7 @@ static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *regs) { struct trace_buffer *buffer; struct ring_buffer_event *event; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 646eda6c44a5..c5095dd28e20 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -23,10 +23,10 @@ static void tracing_start_function_trace(struct trace_array *tr); static void tracing_stop_function_trace(struct trace_array *tr); static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - 
struct ftrace_ops *op, struct pt_regs *pt_regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); static struct tracer_flags func_flags; /* Our option */ @@ -89,7 +89,6 @@ void ftrace_destroy_function_files(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { ftrace_func_t func; - /* * Instance trace_arrays get their ops allocated * at instance creation. Unless it failed @@ -129,7 +128,7 @@ static void function_trace_start(struct trace_array *tr) static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; struct trace_array_cpu *data; @@ -178,7 +177,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 10bbb0f381d5..d06aab4dcbb8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -138,7 +138,7 @@ static int func_prolog_dec(struct trace_array *tr, */ static void irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 97b10bb31a1f..c0181066dbe9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -212,7 +212,7 @@ static void wakeup_print_header(struct seq_file *s) */ static void wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 8ee3c0bb5d8a..5ed081c6471c 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -107,7 +107,7 @@ static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe1_cnt++; } @@ -116,7 +116,7 @@ static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe2_cnt++; } @@ -125,7 +125,7 @@ static int trace_selftest_test_probe3_cnt; static void trace_selftest_test_probe3_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe3_cnt++; } @@ -134,7 +134,7 @@ static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_global_cnt++; } @@ -143,7 +143,7 @@ static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { 
trace_selftest_test_dyn_cnt++; } @@ -414,7 +414,7 @@ static int trace_selftest_recursion_cnt; static void trace_selftest_test_recursion_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { /* * This function is registered without the recursion safe flag. @@ -429,7 +429,7 @@ static void trace_selftest_test_recursion_func(unsigned long ip, static void trace_selftest_test_recursion_safe_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { /* * We said we would provide our own recursion. By calling @@ -548,9 +548,11 @@ static enum { static void trace_selftest_test_regs_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { - if (pt_regs) + struct pt_regs *regs = ftrace_get_regs(fregs); + + if (regs) trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; else trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 969db526a563..63c285042051 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -290,7 +290,7 @@ static void check_stack(unsigned long ip, unsigned long *stack) static void stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { unsigned long stack; -- cgit v1.2.3 From 02a474ca266a47ea8f4d5a11f4ffa120f83730ad Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Oct 2020 10:55:55 -0400 Subject: ftrace/x86: Allow for arguments to be passed in to ftrace_regs by default Currently, the only way to get access to the registers of a function via a ftrace callback is to set the "FL_SAVE_REGS" bit in the ftrace_ops. But as this saves all regs as if a breakpoint were to trigger (for use with kprobes), it is expensive. The regs are already saved on the stack for the default ftrace callbacks, as that is required; otherwise a function being traced would get the wrong arguments and possibly crash. And on x86, the arguments are already stored where they would be in a pt_regs structure, so that the same code can serve both the default and the regs version of a callback; it therefore makes sense to always pass that information to all functions. If an architecture does this (as x86_64 now does), it should set HAVE_DYNAMIC_FTRACE_WITH_ARGS, which lets the generic code know that callbacks can access the arguments without having to set the REGS flag. This also includes having the stack pointer saved, which could be used for accessing arguments on the stack, as well as letting the function graph tracer not require its own trampoline!
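To illustrate what this enables (a sketch assuming x86_64 with this patch applied; the callback name is hypothetical), a callback can now reach the function arguments through the partially-filled ftrace_regs without setting FTRACE_OPS_FL_SAVE_REGS:

	static void my_args_callback(unsigned long ip, unsigned long parent_ip,
				     struct ftrace_ops *op, struct ftrace_regs *fregs)
	{
		/*
		 * Without the REGS flag, ftrace_get_regs(fregs) returns NULL
		 * (cs is 0), but the argument registers and stack pointer in
		 * fregs->regs are valid, so the arch helper still works.
		 */
		unsigned long arg0 = regs_get_kernel_argument(&fregs->regs, 0);

		trace_printk("%ps(arg0=%lx)\n", (void *)ip, arg0);
	}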
Acked-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 15 +++++++++++++++ arch/x86/kernel/ftrace_64.S | 11 +++++++++-- include/linux/ftrace.h | 7 ++++++- kernel/trace/Kconfig | 9 +++++++++ 5 files changed, 40 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f6946b81f74a..478526aabe5d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -167,6 +167,7 @@ config X86 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 84b9449be080..e00fe88146e0 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -41,6 +41,21 @@ static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned regs->orig_ax = addr; } +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs * +arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + /* Only when FL_SAVE_REGS is set, cs will be non zero */ + if (!fregs->regs.cs) + return NULL; + return &fregs->regs; +} +#endif + #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index ac3d5f22fe64..60e3b64f5ea6 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -140,12 +140,19 @@ SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* Stack - skipping return address of ftrace_caller */ + leaq MCOUNT_REG_SIZE+8(%rsp), %rcx + movq %rcx, RSP(%rsp) + SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx - /* regs go into 4th parameter (but make it NULL) */ - movq $0, %rcx + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + + /* Only ops with REGS flag set should have CS register set */ + movq $0, CS(%rsp) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 24e1fa52337d..588ea7023a7a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -90,16 +90,21 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, struct ftrace_ops; +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + struct ftrace_regs { struct pt_regs regs; }; +#define arch_ftrace_get_regs(fregs) (&(fregs)->regs) + +#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) { if (!fregs) return NULL; - return &fregs->regs; + return arch_ftrace_get_regs(fregs); } typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6aa36ec73ccb..c9b64dea1216 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,6 +31,15 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS bool +config HAVE_DYNAMIC_FTRACE_WITH_ARGS + bool + help + If this is set, then arguments and stack can be found from + the pt_regs passed into the function callback regs parameter + by default, even without setting the REGS flag in the ftrace_ops. + This allows for use of regs_get_kernel_argument() and + kernel_stack_pointer(). 
+ config HAVE_FTRACE_MCOUNT_RECORD bool help -- cgit v1.2.3 From 2860cd8a235375df3c8ec8039d9fe5eb2f658b86 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 17:15:27 -0400 Subject: livepatch: Use the default ftrace_ops instead of REGS when ARGS is available When CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS is available, the ftrace call will be able to set the ip of the calling function. This will improve the performance of live kernel patching where it does not need all the regs to be stored just to change the instruction pointer. If all archs that support live kernel patching also support HAVE_DYNAMIC_FTRACE_WITH_ARGS, then the architecture specific function klp_arch_set_pc() could be made generic. It is possible that an arch can support HAVE_DYNAMIC_FTRACE_WITH_ARGS but not HAVE_DYNAMIC_FTRACE_WITH_REGS and then have access to live patching. Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: live-patching@vger.kernel.org Acked-by: Peter Zijlstra (Intel) Acked-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- arch/powerpc/include/asm/livepatch.h | 4 +++- arch/s390/include/asm/livepatch.h | 5 ++++- arch/x86/include/asm/ftrace.h | 3 +++ arch/x86/include/asm/livepatch.h | 4 ++-- arch/x86/kernel/ftrace_64.S | 4 ++++ include/linux/ftrace.h | 7 +++++++ kernel/livepatch/Kconfig | 2 +- kernel/livepatch/patch.c | 9 +++++---- 8 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 4a3d5d25fed5..ae25e6e72997 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -12,8 +12,10 @@ #include #ifdef CONFIG_LIVEPATCH -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { + struct pt_regs *regs = ftrace_get_regs(fregs); + regs->nip = ip; } diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h index 818612b784cd..d578a8c76676 100644 --- a/arch/s390/include/asm/livepatch.h +++ b/arch/s390/include/asm/livepatch.h @@ -11,10 +11,13 @@ #ifndef ASM_LIVEPATCH_H #define ASM_LIVEPATCH_H +#include #include -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { + struct pt_regs *regs = ftrace_get_regs(fregs); + regs->psw.addr = ip; } diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index e00fe88146e0..9f3130f40807 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -54,6 +54,9 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) return NULL; return &fregs->regs; } + +#define ftrace_instruction_pointer_set(fregs, _ip) \ + do { (fregs)->regs.ip = (_ip); } while (0) #endif #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 1fde1ab6559e..7c5cc6660e4b 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -12,9 +12,9 @@ #include #include -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { - regs->ip = ip; + ftrace_instruction_pointer_set(fregs, ip); } #endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 60e3b64f5ea6..0d54099c2a3a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S 
@@ -157,6 +157,10 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE(%rsp) + restore_mcount_regs /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 588ea7023a7a..9a8ce28e4485 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -97,6 +97,13 @@ struct ftrace_regs { }; #define arch_ftrace_get_regs(fregs) (&(fregs)->regs) +/* + * ftrace_instruction_pointer_set() is to be defined by the architecture + * to allow setting of the instruction pointer from the ftrace_regs + * when HAVE_DYNAMIC_FTRACE_WITH_ARGS is set and it supports + * live kernel patching. + */ +#define ftrace_instruction_pointer_set(fregs, ip) do { } while (0) #endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 54102deb50ba..53d51ed619a3 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -6,7 +6,7 @@ config HAVE_LIVEPATCH config LIVEPATCH bool "Kernel Live Patching" - depends on DYNAMIC_FTRACE_WITH_REGS + depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS depends on MODULES depends on SYSFS depends on KALLSYMS_ALL diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index f89f9e7e9b07..e8029aea67f1 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -42,7 +42,6 @@ static void notrace klp_ftrace_handler(unsigned long ip, struct ftrace_ops *fops, struct ftrace_regs *fregs) { - struct pt_regs *regs = ftrace_get_regs(fregs); struct klp_ops *ops; struct klp_func *func; int patch_state; @@ -118,7 +117,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - klp_arch_set_pc(regs, (unsigned long)func->new_func); + klp_arch_set_pc(fregs, (unsigned long)func->new_func); unlock: preempt_enable_notrace(); @@ -200,8 +199,10 @@ static int klp_patch_func(struct klp_func *func) return -ENOMEM; ops->fops.func = klp_ftrace_handler; - ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | - FTRACE_OPS_FL_DYNAMIC | + ops->fops.flags = FTRACE_OPS_FL_DYNAMIC | +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + FTRACE_OPS_FL_SAVE_REGS | +#endif FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_PERMANENT; -- cgit v1.2.3 From 30d6f8c15d2cd877c1f3d47d8a1064649ebe58e2 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 21 Oct 2020 18:21:46 +0200 Subject: clk: add api to get clk consumer from clk_hw clk_register() is deprecated. Using the 'clk' member of struct clk_hw is discouraged. With this constraint, it is difficult for a driver to register clocks using the clk_hw API and then use the clock with the consumer API. This adds a simple helper, clk_hw_get_clk(), to get a struct clk from a struct clk_hw. Like other clk_get() variants, each call to this helper must be balanced with a call to clk_put(). To make life easier on the consumers, a memory managed version is provided as well.
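A sketch of the intended usage from a provider driver (the clk_hw, connection ID and notifier names below are made up for illustration):

	struct clk *clk;
	int ret;

	/* hw was registered earlier, e.g. with devm_clk_hw_register() */
	clk = clk_hw_get_clk(hw, "dvfs");
	if (IS_ERR(clk))
		return PTR_ERR(clk);

	/* act on our own clock through the consumer API */
	ret = clk_notifier_register(clk, &my_nb);

	/* every clk_hw_get_clk() must be balanced with clk_put() */
	clk_put(clk);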
Cc: Martin Blumenstingl Signed-off-by: Jerome Brunet Link: https://lore.kernel.org/r/20201021162147.563655-3-jbrunet@baylibre.com Tested-by: Kevin Hilman [sboyd@kernel.org: Fix kernel-doc] Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 61 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/clk-provider.h | 5 ++++ 2 files changed, 66 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index f70dc0ef1cdd..48931f442de8 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -3667,6 +3667,24 @@ struct clk *clk_hw_create_clk(struct device *dev, struct clk_hw *hw, return clk; } +/** + * clk_hw_get_clk - get clk consumer given an clk_hw + * @hw: clk_hw associated with the clk being consumed + * @con_id: connection ID string on device + * + * Returns: new clk consumer + * This is the function to be used by providers which need + * to get a consumer clk and act on the clock element + * Calls to this function must be balanced with calls clk_put() + */ +struct clk *clk_hw_get_clk(struct clk_hw *hw, const char *con_id) +{ + struct device *dev = hw->core->dev; + + return clk_hw_create_clk(dev, hw, dev_name(dev), con_id); +} +EXPORT_SYMBOL(clk_hw_get_clk); + static int clk_cpy_name(const char **dst_p, const char *src, bool must_exist) { const char *dst; @@ -4187,6 +4205,49 @@ void devm_clk_hw_unregister(struct device *dev, struct clk_hw *hw) } EXPORT_SYMBOL_GPL(devm_clk_hw_unregister); +static void devm_clk_release(struct device *dev, void *res) +{ + clk_put(*(struct clk **)res); +} + +/** + * devm_clk_hw_get_clk - resource managed clk_hw_get_clk() + * @dev: device that is registering this clock + * @hw: clk_hw associated with the clk being consumed + * @con_id: connection ID string on device + * + * Managed clk_hw_get_clk(). Clocks got with this function are + * automatically clk_put() on driver detach. See clk_put() + * for more information. 
+ */ +struct clk *devm_clk_hw_get_clk(struct device *dev, struct clk_hw *hw, + const char *con_id) +{ + struct clk *clk; + struct clk **clkp; + + /* This should not happen because it would mean we have drivers + * passing around clk_hw pointers instead of having the caller use + * proper clk_get() style APIs + */ + WARN_ON_ONCE(dev != hw->core->dev); + + clkp = devres_alloc(devm_clk_release, sizeof(*clkp), GFP_KERNEL); + if (!clkp) + return ERR_PTR(-ENOMEM); + + clk = clk_hw_get_clk(hw, con_id); + if (!IS_ERR(clk)) { + *clkp = clk; + devres_add(dev, clkp); + } else { + devres_free(clkp); + } + + return clk; +} +EXPORT_SYMBOL_GPL(devm_clk_hw_get_clk); + /* * clkdev helpers */ diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 03a5de5f99f4..86b707520ec0 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1088,6 +1088,11 @@ static inline struct clk_hw *__clk_get_hw(struct clk *clk) return (struct clk_hw *)clk; } #endif + +struct clk *clk_hw_get_clk(struct clk_hw *hw, const char *con_id); +struct clk *devm_clk_hw_get_clk(struct device *dev, struct clk_hw *hw, + const char *con_id); + unsigned int clk_hw_get_num_parents(const struct clk_hw *hw); struct clk_hw *clk_hw_get_parent(const struct clk_hw *hw); struct clk_hw *clk_hw_get_parent_by_index(const struct clk_hw *hw, -- cgit v1.2.3 From 6d30d50d037dfa092f9d5d1fffa348ab4abb7163 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 21 Oct 2020 18:38:46 +0200 Subject: clk: add devm variant of clk_notifier_register Add a memory managed variant of clk_notifier_register() to make life easier on clock consumers using notifiers Signed-off-by: Jerome Brunet Link: https://lore.kernel.org/r/20201021163847.595189-2-jbrunet@baylibre.com Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/clk.h | 10 ++++++++++ 2 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 48931f442de8..6cf59e3c31b4 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -4395,6 +4395,42 @@ int clk_notifier_unregister(struct clk *clk, struct notifier_block *nb) } EXPORT_SYMBOL_GPL(clk_notifier_unregister); +struct clk_notifier_devres { + struct clk *clk; + struct notifier_block *nb; +}; + +static void devm_clk_notifier_release(struct device *dev, void *res) +{ + struct clk_notifier_devres *devres = res; + + clk_notifier_unregister(devres->clk, devres->nb); +} + +int devm_clk_notifier_register(struct device *dev, struct clk *clk, + struct notifier_block *nb) +{ + struct clk_notifier_devres *devres; + int ret; + + devres = devres_alloc(devm_clk_notifier_release, + sizeof(*devres), GFP_KERNEL); + + if (!devres) + return -ENOMEM; + + ret = clk_notifier_register(clk, nb); + if (!ret) { + devres->clk = clk; + devres->nb = nb; + } else { + devres_free(devres); + } + + return ret; +} +EXPORT_SYMBOL_GPL(devm_clk_notifier_register); + #ifdef CONFIG_OF static void clk_core_reparent_orphans(void) { diff --git a/include/linux/clk.h b/include/linux/clk.h index 7fd6a1febcf4..f53afdf8198b 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -109,6 +109,16 @@ int clk_notifier_register(struct clk *clk, struct notifier_block *nb); */ int clk_notifier_unregister(struct clk *clk, struct notifier_block *nb); +/** + * devm_clk_notifier_register - register a managed rate-change notifier callback + * @dev: device for clock "consumer" + * @clk: clock whose rate we are interested in + * @nb: notifier block with callback 
function pointer + * + * Returns 0 on success, -EERROR otherwise + */ +int devm_clk_notifier_register(struct device *dev, struct clk *clk, struct notifier_block *nb); + /** * clk_get_accuracy - obtain the clock accuracy in ppb (parts per billion) * for a clock source. -- cgit v1.2.3 From e6fb7aee486c7fbd4d94f4894feaa6f0424c1740 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Wed, 21 Oct 2020 18:38:47 +0200 Subject: clk: meson: g12: use devm variant to register notifiers Until now, nothing was done to unregister the dvfs clock notifiers of the Amlogic g12 SoC family. This is not great but this driver was not really expected to be unloaded. With the ongoing effort to build everything as module for this platform, this needs to be cleanly handled. Signed-off-by: Jerome Brunet Link: https://lore.kernel.org/r/20201021163847.595189-3-jbrunet@baylibre.com Signed-off-by: Stephen Boyd --- drivers/clk/meson/g12a.c | 34 ++++++++++++++++++++-------------- include/linux/clk.h | 10 +++++++++- 2 files changed, 29 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/meson/g12a.c b/drivers/clk/meson/g12a.c index 235dcf72e34a..108e4491b1e2 100644 --- a/drivers/clk/meson/g12a.c +++ b/drivers/clk/meson/g12a.c @@ -5171,8 +5171,8 @@ static int meson_g12a_dvfs_setup_common(struct device *dev, g12a_cpu_clk_postmux0_nb_data.xtal = xtal; notifier_clk = devm_clk_hw_get_clk(dev, &g12a_cpu_clk_postmux0.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, - &g12a_cpu_clk_postmux0_nb_data.nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_postmux0_nb_data.nb); if (ret) { dev_err(dev, "failed to register the cpu_clk_postmux0 notifier\n"); return ret; @@ -5181,7 +5181,8 @@ static int meson_g12a_dvfs_setup_common(struct device *dev, /* Setup clock notifier for cpu_clk_dyn mux */ notifier_clk = devm_clk_hw_get_clk(dev, &g12a_cpu_clk_dyn.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_mux_nb); if (ret) { dev_err(dev, "failed to register the cpu_clk_dyn notifier\n"); return ret; @@ -5207,7 +5208,8 @@ static int meson_g12b_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for cpu_clk mux */ notifier_clk = devm_clk_hw_get_clk(dev, &g12b_cpu_clk.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_mux_nb); if (ret) { dev_err(dev, "failed to register the cpu_clk notifier\n"); return ret; @@ -5216,8 +5218,8 @@ static int meson_g12b_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for sys1_pll */ notifier_clk = devm_clk_hw_get_clk(dev, &g12b_sys1_pll.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, - &g12b_cpu_clk_sys1_pll_nb_data.nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12b_cpu_clk_sys1_pll_nb_data.nb); if (ret) { dev_err(dev, "failed to register the sys1_pll notifier\n"); return ret; @@ -5229,8 +5231,8 @@ static int meson_g12b_dvfs_setup(struct platform_device *pdev) g12b_cpub_clk_postmux0_nb_data.xtal = xtal; notifier_clk = devm_clk_hw_get_clk(dev, &g12b_cpub_clk_postmux0.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, - &g12b_cpub_clk_postmux0_nb_data.nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12b_cpub_clk_postmux0_nb_data.nb); if (ret) { dev_err(dev, "failed to register the cpub_clk_postmux0 notifier\n"); return ret; @@ -5238,7 +5240,8 @@ static int 
meson_g12b_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for cpub_clk_dyn mux */ notifier_clk = devm_clk_hw_get_clk(dev, &g12b_cpub_clk_dyn.hw, "dvfs"); - ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_mux_nb); if (ret) { dev_err(dev, "failed to register the cpub_clk_dyn notifier\n"); return ret; @@ -5246,7 +5249,8 @@ static int meson_g12b_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for cpub_clk mux */ notifier_clk = devm_clk_hw_get_clk(dev, &g12b_cpub_clk.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_mux_nb); if (ret) { dev_err(dev, "failed to register the cpub_clk notifier\n"); return ret; @@ -5254,8 +5258,8 @@ static int meson_g12b_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for sys_pll */ notifier_clk = devm_clk_hw_get_clk(dev, &g12a_sys_pll.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, - &g12b_cpub_clk_sys_pll_nb_data.nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12b_cpub_clk_sys_pll_nb_data.nb); if (ret) { dev_err(dev, "failed to register the sys_pll notifier\n"); return ret; @@ -5277,7 +5281,8 @@ static int meson_g12a_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for cpu_clk mux */ notifier_clk = devm_clk_hw_get_clk(dev, &g12a_cpu_clk.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, &g12a_cpu_clk_mux_nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_cpu_clk_mux_nb); if (ret) { dev_err(dev, "failed to register the cpu_clk notifier\n"); return ret; @@ -5285,7 +5290,8 @@ static int meson_g12a_dvfs_setup(struct platform_device *pdev) /* Setup clock notifier for sys_pll */ notifier_clk = devm_clk_hw_get_clk(dev, &g12a_sys_pll.hw, DVFS_CON_ID); - ret = clk_notifier_register(notifier_clk, &g12a_sys_pll_nb_data.nb); + ret = devm_clk_notifier_register(dev, notifier_clk, + &g12a_sys_pll_nb_data.nb); if (ret) { dev_err(dev, "failed to register the sys_pll notifier\n"); return ret; diff --git a/include/linux/clk.h b/include/linux/clk.h index f53afdf8198b..4ac766dc3daf 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -117,7 +117,8 @@ int clk_notifier_unregister(struct clk *clk, struct notifier_block *nb); * * Returns 0 on success, -EERROR otherwise */ -int devm_clk_notifier_register(struct device *dev, struct clk *clk, struct notifier_block *nb); +int devm_clk_notifier_register(struct device *dev, struct clk *clk, + struct notifier_block *nb); /** * clk_get_accuracy - obtain the clock accuracy in ppb (parts per billion) @@ -196,6 +197,13 @@ static inline int clk_notifier_unregister(struct clk *clk, return -ENOTSUPP; } +static inline int devm_clk_notifier_register(struct device *dev, + struct clk *clk, + struct notifier_block *nb) +{ + return -ENOTSUPP; +} + static inline long clk_get_accuracy(struct clk *clk) { return -ENOTSUPP; -- cgit v1.2.3 From c4d51a52c67a1e3a0fa3006e5ec21cdc07649cd6 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 27 Oct 2020 14:39:43 +0000 Subject: sched/wait: Add add_wait_queue_priority() This allows an exclusive wait_queue_entry to be added at the head of the queue, instead of the tail as normal. Thus, it gets to consume events first without allowing non-exclusive waiters to be woken at all. 
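As a minimal illustration of the new API (a sketch, not part of this patch; the my_-prefixed names are assumptions), a consumer registers a custom wake function and queues its entry with the new helper so that its callback runs before any other waiter:

#include <linux/wait.h>

static int my_wakeup(struct wait_queue_entry *wait, unsigned int mode,
		     int sync, void *key)
{
	/*
	 * Runs before all non-priority waiters because the entry sits at
	 * the head of the list. Consume the event here; returning nonzero
	 * from an exclusive entry stops the exclusive wakeup scan.
	 */
	return 1;
}

static void my_register(struct wait_queue_head *wqh,
			struct wait_queue_entry *wait)
{
	init_waitqueue_func_entry(wait, my_wakeup);
	/* Sets WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY and queues at the head. */
	add_wait_queue_priority(wqh, wait);
}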
The (first) intended use is for KVM IRQFD, which currently has inconsistent behaviour depending on whether posted interrupts are available or not. If they are, KVM will bypass the eventfd completely and deliver interrupts directly to the appropriate vCPU. If not, events are delivered through the eventfd and userspace will receive them when polling on the eventfd. By using add_wait_queue_priority(), KVM will be able to consistently consume events within the kernel without accidentally exposing them to userspace when they're supposed to be bypassed. This, in turn, means that userspace doesn't have to jump through hoops to avoid listening on the erroneously noisy eventfd and injecting duplicate interrupts. Signed-off-by: David Woodhouse Message-Id: <20201027143944.648769-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/linux/wait.h | 12 +++++++++++- kernel/sched/wait.c | 17 ++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 27fb99cfeb02..fe10e8570a52 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -22,6 +22,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int #define WQ_FLAG_BOOKMARK 0x04 #define WQ_FLAG_CUSTOM 0x08 #define WQ_FLAG_DONE 0x10 +#define WQ_FLAG_PRIORITY 0x20 /* * A single wait-queue entry structure: @@ -164,11 +165,20 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - list_add(&wq_entry->entry, &wq_head->head); + struct list_head *head = &wq_head->head; + struct wait_queue_entry *wq; + + list_for_each_entry(wq, &wq_head->head, entry) { + if (!(wq->flags & WQ_FLAG_PRIORITY)) + break; + head = &wq->entry; + } + list_add(&wq_entry->entry, head); } /* diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 01f5d3020589..183cc6ae68a6 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -37,6 +37,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue } EXPORT_SYMBOL(add_wait_queue_exclusive); +void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL_GPL(add_wait_queue_priority); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -57,7 +68,11 @@ EXPORT_SYMBOL(remove_wait_queue); /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. + * number) then we wake that number of exclusive tasks, and potentially all + * the non-exclusive tasks. 
Normally, exclusive tasks will be at the end of + * the list and any non-exclusive tasks will be woken first. A priority task + * may be at the head of the list, and can consume the event without any other + * tasks being woken. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns -- cgit v1.2.3 From 28f1326710555bbe666f64452d08f2d7dd657cae Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 27 Oct 2020 13:55:21 +0000 Subject: eventfd: Export eventfd_ctx_do_read() Where events are consumed in the kernel, for example by KVM's irqfd_wakeup() and VFIO's virqfd_wakeup(), they currently lack a mechanism to drain the eventfd's counter. Since the wait queue is already locked while the wakeup functions are invoked, all they really need to do is call eventfd_ctx_do_read(). Add a check for the lock, and export it for them. Signed-off-by: David Woodhouse Message-Id: <20201027135523.646811-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- fs/eventfd.c | 5 ++++- include/linux/eventfd.h | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/eventfd.c b/fs/eventfd.c index df466ef81ddd..e265b6dd4f34 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -182,11 +182,14 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait) return events; } -static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { + lockdep_assert_held(&ctx->wqh.lock); + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; ctx->count -= *cnt; } +EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); /** * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index dc4fd8a6644d..fa0a524baed0 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -41,6 +41,7 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); +void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); DECLARE_PER_CPU(int, eventfd_wake_count); @@ -82,6 +83,11 @@ static inline bool eventfd_signal_count(void) return false; } +static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + +} + #endif #endif /* _LINUX_EVENTFD_H */ -- cgit v1.2.3 From 2f5414423ef577e9e8bdb227f32d0abdd34e4274 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 6 Nov 2020 05:25:09 -0500 Subject: KVM: remove kvm_clear_guest_page kvm_clear_guest_page is not used anymore after "KVM: X86: Don't track dirty for KVM_SET_[TSS_ADDR|IDENTITY_MAP_ADDR]", except from kvm_clear_guest. We can just inline it in its sole user. 
Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 11 ++--------- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7f2e2a09ebbd..66a4324f329d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -792,7 +792,6 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, offset_in_page(__gpa), v); \ }) -int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2541a17ff1c4..1c7514579861 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2616,23 +2616,16 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, } EXPORT_SYMBOL_GPL(kvm_read_guest_cached); -int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) -{ - const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); - - return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); -} -EXPORT_SYMBOL_GPL(kvm_clear_guest_page); - int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) { + const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); gfn_t gfn = gpa >> PAGE_SHIFT; int seg; int offset = offset_in_page(gpa); int ret; while ((seg = next_segment(len, offset)) != 0) { - ret = kvm_clear_guest_page(kvm, gfn, offset, seg); + ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg); if (ret < 0) return ret; offset = 0; -- cgit v1.2.3 From 28bd726aa404c0da8fd6852fe69bb4538a103b71 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 30 Sep 2020 21:20:34 -0400 Subject: KVM: Pass in kvm pointer into mark_page_dirty_in_slot() The context will be needed to implement the kvm dirty ring.
Signed-off-by: Peter Xu Message-Id: <20201001012044.5151-5-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 30 +++++++++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 66a4324f329d..ca7c1459a8e3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -797,7 +797,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); -void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); +void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1c7514579861..68598fdba226 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2196,7 +2196,8 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) } EXPORT_SYMBOL_GPL(kvm_vcpu_map); -static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, +static void __kvm_unmap_gfn(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_host_map *map, struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) @@ -2221,7 +2222,7 @@ static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, #endif if (dirty) - mark_page_dirty_in_slot(memslot, map->gfn); + mark_page_dirty_in_slot(kvm, memslot, map->gfn); if (cache) cache->dirty |= dirty; @@ -2235,7 +2236,7 @@ static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) { - __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, + __kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map, cache, dirty, atomic); return 0; } @@ -2243,8 +2244,8 @@ EXPORT_SYMBOL_GPL(kvm_unmap_gfn); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) { - __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL, - dirty, false); + __kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), + map, NULL, dirty, false); } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); @@ -2418,7 +2419,8 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, } EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); -static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, +static int __kvm_write_guest_page(struct kvm *kvm, + struct kvm_memory_slot *memslot, gfn_t gfn, const void *data, int offset, int len) { int r; @@ -2430,7 +2432,7 @@ static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, r = __copy_to_user((void __user *)addr + offset, data, len); if (r) return -EFAULT; - mark_page_dirty_in_slot(memslot, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); return 0; } @@ -2439,7 +2441,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, { struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); - return __kvm_write_guest_page(slot, gfn, data, offset, len); + return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_write_guest_page); @@ -2448,7 +2450,7 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, { struct kvm_memory_slot *slot = 
kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __kvm_write_guest_page(slot, gfn, data, offset, len); + return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); @@ -2567,7 +2569,7 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, r = __copy_to_user((void __user *)ghc->hva + offset, data, len); if (r) return -EFAULT; - mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); + mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT); return 0; } @@ -2636,7 +2638,9 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } EXPORT_SYMBOL_GPL(kvm_clear_guest); -void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn) +void mark_page_dirty_in_slot(struct kvm *kvm, + struct kvm_memory_slot *memslot, + gfn_t gfn) { if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; @@ -2651,7 +2655,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) struct kvm_memory_slot *memslot; memslot = gfn_to_memslot(kvm, gfn); - mark_page_dirty_in_slot(memslot, gfn); + mark_page_dirty_in_slot(kvm, memslot, gfn); } EXPORT_SYMBOL_GPL(mark_page_dirty); @@ -2660,7 +2664,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) struct kvm_memory_slot *memslot; memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - mark_page_dirty_in_slot(memslot, gfn); + mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn); } EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); -- cgit v1.2.3 From fb04a1eddb1a65b6588a021bdc132270d5ae48bb Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 30 Sep 2020 21:22:22 -0400 Subject: KVM: X86: Implement ring-based dirty memory tracking This patch is heavily based on previous work from Lei Cao and Paolo Bonzini . [1] KVM currently uses large bitmaps to track dirty memory. These bitmaps are copied to userspace when userspace queries KVM for its dirty page information. The use of bitmaps is mostly sufficient for live migration, as large parts of memory are dirtied from one log-dirty pass to another. However, in a checkpointing system, the number of dirty pages is small and in fact it is often bounded---the VM is paused when it has dirtied a pre-defined number of pages. Traversing a large, sparsely populated bitmap to find set bits is time-consuming, as is copying the bitmap to user-space. A similar issue exists for live migration when the guest memory is huge while the page dirty procedure is trivial. In that case, for each dirty sync we need to pull the whole dirty bitmap to userspace and analyse every bit even if it's mostly zeros. The preferred data structure for the above scenarios is a dense list of guest frame numbers (GFN). This patch series stores the dirty list in kernel memory that can be memory mapped into userspace to allow speedy harvesting. This patch enables the dirty ring for X86 only. However, it should be easy to extend it to other archs as well.
[1] https://patchwork.kernel.org/patch/10471409/ Signed-off-by: Lei Cao Signed-off-by: Paolo Bonzini Signed-off-by: Peter Xu Message-Id: <20201001012222.5767-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 93 +++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 3 + arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/Makefile | 3 +- arch/x86/kvm/mmu/mmu.c | 8 ++ arch/x86/kvm/mmu/tdp_mmu.c | 2 +- arch/x86/kvm/vmx/vmx.c | 7 ++ arch/x86/kvm/x86.c | 9 ++ include/linux/kvm_dirty_ring.h | 103 +++++++++++++++++++++ include/linux/kvm_host.h | 13 +++ include/trace/events/kvm.h | 63 +++++++++++++ include/uapi/linux/kvm.h | 53 +++++++++++ virt/kvm/dirty_ring.c | 194 ++++++++++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 113 ++++++++++++++++++++++- 14 files changed, 662 insertions(+), 3 deletions(-) create mode 100644 include/linux/kvm_dirty_ring.h create mode 100644 virt/kvm/dirty_ring.c (limited to 'include/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 81d54fe76a2d..e264ebc35e27 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -262,6 +262,18 @@ The KVM_RUN ioctl (cf.) communicates with userspace via a shared memory region. This ioctl returns the size of that region. See the KVM_RUN documentation for details. +Besides the size of the KVM_RUN communication region, other areas of +the VCPU file descriptor can be mmap-ed, including: + +- if KVM_CAP_COALESCED_MMIO is available, a page at + KVM_COALESCED_MMIO_PAGE_OFFSET * PAGE_SIZE; for historical reasons, + this page is included in the result of KVM_GET_VCPU_MMAP_SIZE. + KVM_CAP_COALESCED_MMIO is not documented yet. + +- if KVM_CAP_DIRTY_LOG_RING is available, a number of pages at + KVM_DIRTY_LOG_PAGE_OFFSET * PAGE_SIZE. For more information on + KVM_CAP_DIRTY_LOG_RING, see section 8.3. + 4.6 KVM_SET_MEMORY_REGION ------------------------- @@ -6396,3 +6408,84 @@ When enabled, KVM will disable paravirtual features provided to the guest according to the bits in the KVM_CPUID_FEATURES CPUID leaf (0x40000001). Otherwise, a guest may use the paravirtual features regardless of what has actually been exposed through the CPUID leaf. + + +8.29 KVM_CAP_DIRTY_LOG_RING +--------------------------- + +:Architectures: x86 :Parameters: args[0] - size of the dirty log ring +KVM is capable of tracking dirty memory using ring buffers that are +mmaped into userspace; there is one dirty ring per vcpu. + +The dirty ring is available to userspace as an array of +``struct kvm_dirty_gfn``. Each dirty entry is defined as:: + + struct kvm_dirty_gfn { + __u32 flags; + __u32 slot; /* as_id | slot_id */ + __u64 offset; + }; + +The following values are defined for the flags field to define the +current state of the entry:: + + #define KVM_DIRTY_GFN_F_DIRTY BIT(0) + #define KVM_DIRTY_GFN_F_RESET BIT(1) + #define KVM_DIRTY_GFN_F_MASK 0x3 + +Userspace should call KVM_ENABLE_CAP ioctl right after KVM_CREATE_VM +ioctl to enable this capability for the new guest and set the size of +the rings. Enabling the capability is only allowed before creating any +vCPU, and the size of the ring must be a power of two. The larger the +ring buffer, the less likely the ring is full and the VM is forced to +exit to userspace. The optimal size depends on the workload, but it is +recommended that it be at least 64 KiB (4096 entries).
+ +Just like for dirty page bitmaps, the buffer tracks writes to +all user memory regions for which the KVM_MEM_LOG_DIRTY_PAGES flag was +set in KVM_SET_USER_MEMORY_REGION. Once a memory region is registered +with the flag set, userspace can start harvesting dirty pages from the +ring buffer. + +An entry in the ring buffer can be unused (flag bits ``00``), +dirty (flag bits ``01``) or harvested (flag bits ``1X``). The +state machine for the entry is as follows:: + + dirtied harvested reset + 00 -----------> 01 -------------> 1X -------+ + ^ | + | | + +------------------------------------------+ + +To harvest the dirty pages, userspace accesses the mmaped ring buffer +to read the dirty GFNs. If the flags field has the DIRTY bit set (at this stage +the RESET bit must be cleared), then it means this GFN is a dirty GFN. +The userspace should harvest this GFN and mark the flags from state +``01b`` to ``1Xb`` (bit 0 will be ignored by KVM, but bit 1 must be set +to show that this GFN is harvested and waiting for a reset), and move +on to the next GFN. The userspace should continue to do this until the +flags of a GFN have the DIRTY bit cleared, meaning that it has harvested +all the dirty GFNs that were available. + +It's not necessary for userspace to harvest all the dirty GFNs at once. +However, it must collect the dirty GFNs in sequence, i.e., the userspace +program cannot skip one dirty GFN to collect the one next to it. + +After processing one or more entries in the ring buffer, userspace +calls the VM ioctl KVM_RESET_DIRTY_RINGS to notify the kernel about +it, so that the kernel will reprotect those collected GFNs. +Therefore, the ioctl must be called *before* reading the content of +the dirty pages. + +The dirty ring can get full. When that happens, the KVM_RUN of the +vcpu will return with exit reason KVM_EXIT_DIRTY_RING_FULL. + +The dirty ring interface has a major difference compared to the +KVM_GET_DIRTY_LOG interface in that, when reading the dirty ring from +userspace, it's still possible that the kernel has not yet flushed the +processor's dirty page buffers into the kernel buffer (with dirty bitmaps, the +flushing is done by the KVM_GET_DIRTY_LOG ioctl). To achieve that, one +needs to kick the vcpu out of KVM_RUN using a signal. The resulting +vmexit ensures that all dirty GFNs are flushed to the dirty rings.
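To make the harvesting procedure above concrete, here is a rough userspace sketch built only from the ABI documented in this section (the helper name, the modulo indexing and the elided memory barriers are simplifications; enabling KVM_CAP_DIRTY_LOG_RING and mmap()ing the ring are assumed to have been done already). It also shows why the ring wins for sparse dirtying: for a hypothetical 64 GiB guest with 4 KiB pages, a dirty bitmap costs 2 MiB of scanning per pass, while the recommended 4096-entry ring is the 64 KiB mentioned above and only its used entries are touched.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Harvest one vcpu's dirty ring; "next" persists across calls. */
static uint32_t harvest_ring(int vm_fd, struct kvm_dirty_gfn *ring,
			     uint32_t nents, uint32_t *next)
{
	uint32_t harvested = 0;

	for (;;) {
		struct kvm_dirty_gfn *e = &ring[*next % nents];

		/* Stop at the first entry that is not in the dirty state. */
		if (!(e->flags & KVM_DIRTY_GFN_F_DIRTY))
			break;

		/* ... record e->slot and e->offset for the consumer ... */

		/* 01b -> 1Xb: mark the entry harvested, awaiting reset. */
		e->flags |= KVM_DIRTY_GFN_F_RESET;
		(*next)++;
		harvested++;
	}

	/* Have the kernel reprotect the collected GFNs. */
	if (harvested)
		ioctl(vm_fd, KVM_RESET_DIRTY_RINGS, 0);

	return harvested;
}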
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 69e94aa716e9..f002cdb13a0b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1232,6 +1232,7 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); + int (*cpu_dirty_log_size)(void); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; @@ -1744,4 +1745,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define GET_SMSTATE(type, buf, offset) \ (*(type *)((buf) + (offset) - 0x7e00)) +int kvm_cpu_dirty_log_size(void); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 89e5f3d1bba8..8e76d3701db3 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -12,6 +12,7 @@ #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 #define DE_VECTOR 0 #define DB_VECTOR 1 diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index b804444e16d4..4bd14ab01323 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -10,7 +10,8 @@ endif KVM := ../../../virt/kvm kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o + $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o \ + $(KVM)/dirty_ring.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5bb1939b65d8..12e5cfe0995e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1289,6 +1289,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } +int kvm_cpu_dirty_log_size(void) +{ + if (kvm_x86_ops.cpu_dirty_log_size) + return kvm_x86_ops.cpu_dirty_log_size(); + + return 0; +} + bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index ff28a5c6abd6..cffa51c6049e 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -185,7 +185,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, if ((!is_writable_pte(old_spte) || pfn_changed) && is_writable_pte(new_spte)) { slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); - mark_page_dirty_in_slot(slot, gfn); + mark_page_dirty_in_slot(kvm, slot, gfn); } } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 46b32aa43811..2b6d538454a6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7583,6 +7583,11 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit) return supported & BIT(bit); } +static int vmx_cpu_dirty_log_size(void) +{ + return enable_pml ? 
PML_ENTITY_NUM : 0; +} + static struct kvm_x86_ops vmx_x86_ops __initdata = { .hardware_unsetup = hardware_unsetup, @@ -7712,6 +7717,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .migrate_timers = vmx_migrate_timers, .msr_filter_changed = vmx_msr_filter_changed, + .cpu_dirty_log_size = vmx_cpu_dirty_log_size, }; static __init int hardware_setup(void) @@ -7829,6 +7835,7 @@ static __init int hardware_setup(void) vmx_x86_ops.slot_disable_log_dirty = NULL; vmx_x86_ops.flush_log_dirty = NULL; vmx_x86_ops.enable_log_dirty_pt_masked = NULL; + vmx_x86_ops.cpu_dirty_log_size = NULL; } if (!cpu_has_vmx_preemption_timer()) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4ac726526f8..6c704a597b7c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8754,6 +8754,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; + /* Forbid vmenter if vcpu dirty ring is soft-full */ + if (unlikely(vcpu->kvm->dirty_ring_size && + kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) { + vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; + trace_kvm_dirty_ring_exit(vcpu); + r = 0; + goto out; + } + if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h new file mode 100644 index 000000000000..120e5e90fa1d --- /dev/null +++ b/include/linux/kvm_dirty_ring.h @@ -0,0 +1,103 @@ +#ifndef KVM_DIRTY_RING_H +#define KVM_DIRTY_RING_H + +#include + +/** + * kvm_dirty_ring: KVM internal dirty ring structure + * + * @dirty_index: free running counter that points to the next slot in + * dirty_ring->dirty_gfns, where a new dirty page should go + * @reset_index: free running counter that points to the next dirty page + * in dirty_ring->dirty_gfns for which dirty trap needs to + * be reenabled + * @size: size of the compact list, dirty_ring->dirty_gfns + * @soft_limit: when the number of dirty pages in the list reaches this + * limit, the vcpu that owns this ring should exit to userspace + * to allow userspace to harvest all the dirty pages + * @dirty_gfns: the array to keep the dirty gfns + * @index: index of this dirty ring + */ +struct kvm_dirty_ring { + u32 dirty_index; + u32 reset_index; + u32 size; + u32 soft_limit; + struct kvm_dirty_gfn *dirty_gfns; + int index; +}; + +#if (KVM_DIRTY_LOG_PAGE_OFFSET == 0) +/* + * If KVM_DIRTY_LOG_PAGE_OFFSET is not defined, kvm_dirty_ring.o should + * not be built either, so define these nop functions for the arch.
+ */ +static inline u32 kvm_dirty_ring_get_rsvd_entries(void) +{ + return 0; +} + +static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, + int index, u32 size) +{ + return 0; +} + +static inline struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm) +{ + return NULL; +} + +static inline int kvm_dirty_ring_reset(struct kvm *kvm, + struct kvm_dirty_ring *ring) +{ + return 0; +} + +static inline void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, + u32 slot, u64 offset) +{ +} + +static inline struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, + u32 offset) +{ + return NULL; +} + +static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) +{ +} + +static inline bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) +{ + return true; +} + +#else /* KVM_DIRTY_LOG_PAGE_OFFSET == 0 */ + +u32 kvm_dirty_ring_get_rsvd_entries(void); +int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); +struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm); + +/* + * called with kvm->slots_lock held, returns the number of + * processed pages. + */ +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring); + +/* + * Pushes a dirty gfn onto the ring. The caller must ensure the ring is + * not full; vcpu paths check kvm_dirty_ring_soft_full() before + * re-entering the guest. + */ +void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset); + +/* for use in vm_operations_struct */ +struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset); + +void kvm_dirty_ring_free(struct kvm_dirty_ring *ring); +bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring); + +#endif /* KVM_DIRTY_LOG_PAGE_OFFSET == 0 */ + +#endif /* KVM_DIRTY_RING_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ca7c1459a8e3..864b156391c8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -34,6 +34,7 @@ #include #include +#include #ifndef KVM_MAX_VCPU_ID #define KVM_MAX_VCPU_ID KVM_MAX_VCPUS @@ -319,6 +320,7 @@ struct kvm_vcpu { bool preempted; bool ready; struct kvm_vcpu_arch arch; + struct kvm_dirty_ring dirty_ring; }; static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) @@ -505,6 +507,7 @@ struct kvm { struct srcu_struct irq_srcu; pid_t userspace_pid; unsigned int max_halt_poll_ns; + u32 dirty_ring_size; }; #define kvm_err(fmt, ...) \ @@ -1477,4 +1480,14 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) } #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ +/* + * This defines how many reserved entries we want to keep before we + * kick the vcpu to userspace to avoid the dirty ring becoming full. + * This value can be tuned higher if e.g. PML is enabled on the host.
+ */ +#define KVM_DIRTY_RING_RSVD_ENTRIES 64 + +/* Max number of entries allowed for each kvm dirty ring */ +#define KVM_DIRTY_RING_MAX_ENTRIES 65536 + #endif diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 26cfb0fa8e7e..49d7d0fe29f6 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -399,6 +399,69 @@ TRACE_EVENT(kvm_halt_poll_ns, #define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \ trace_kvm_halt_poll_ns(false, vcpu_id, new, old) +TRACE_EVENT(kvm_dirty_ring_push, + TP_PROTO(struct kvm_dirty_ring *ring, u32 slot, u64 offset), + TP_ARGS(ring, slot, offset), + + TP_STRUCT__entry( + __field(int, index) + __field(u32, dirty_index) + __field(u32, reset_index) + __field(u32, slot) + __field(u64, offset) + ), + + TP_fast_assign( + __entry->index = ring->index; + __entry->dirty_index = ring->dirty_index; + __entry->reset_index = ring->reset_index; + __entry->slot = slot; + __entry->offset = offset; + ), + + TP_printk("ring %d: dirty 0x%x reset 0x%x " + "slot %u offset 0x%llx (used %u)", + __entry->index, __entry->dirty_index, + __entry->reset_index, __entry->slot, __entry->offset, + __entry->dirty_index - __entry->reset_index) +); + +TRACE_EVENT(kvm_dirty_ring_reset, + TP_PROTO(struct kvm_dirty_ring *ring), + TP_ARGS(ring), + + TP_STRUCT__entry( + __field(int, index) + __field(u32, dirty_index) + __field(u32, reset_index) + ), + + TP_fast_assign( + __entry->index = ring->index; + __entry->dirty_index = ring->dirty_index; + __entry->reset_index = ring->reset_index; + ), + + TP_printk("ring %d: dirty 0x%x reset 0x%x (used %u)", + __entry->index, __entry->dirty_index, __entry->reset_index, + __entry->dirty_index - __entry->reset_index) +); + +TRACE_EVENT(kvm_dirty_ring_exit, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + + TP_STRUCT__entry( + __field(int, vcpu_id) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + ), + + TP_printk("vcpu %d", __entry->vcpu_id) +); + #endif /* _TRACE_KVM_MAIN_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 204afbe1240e..886802b8ffba 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -250,6 +250,7 @@ struct kvm_hyperv_exit { #define KVM_EXIT_ARM_NISV 28 #define KVM_EXIT_X86_RDMSR 29 #define KVM_EXIT_X86_WRMSR 30 +#define KVM_EXIT_DIRTY_RING_FULL 31 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -1054,6 +1055,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_X86_MSR_FILTER 189 #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 #define KVM_CAP_SYS_HYPERV_CPUID 191 +#define KVM_CAP_DIRTY_LOG_RING 192 #ifdef KVM_CAP_IRQ_ROUTING @@ -1558,6 +1560,9 @@ struct kvm_pv_cmd { /* Available with KVM_CAP_X86_MSR_FILTER */ #define KVM_X86_SET_MSR_FILTER _IOW(KVMIO, 0xc6, struct kvm_msr_filter) +/* Available with KVM_CAP_DIRTY_LOG_RING */ +#define KVM_RESET_DIRTY_RINGS _IO(KVMIO, 0xc7) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ @@ -1711,4 +1716,52 @@ struct kvm_hyperv_eventfd { #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) #define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1) +/* + * Arch needs to define the macro after implementing the dirty ring + * feature. KVM_DIRTY_LOG_PAGE_OFFSET should be defined as the + * starting page offset of the dirty ring structures. 
+ */ +#ifndef KVM_DIRTY_LOG_PAGE_OFFSET +#define KVM_DIRTY_LOG_PAGE_OFFSET 0 +#endif + +/* + * KVM dirty GFN flags, defined as: + * + * |---------------+---------------+--------------| + * | bit 1 (reset) | bit 0 (dirty) | Status | + * |---------------+---------------+--------------| + * | 0 | 0 | Invalid GFN | + * | 0 | 1 | Dirty GFN | + * | 1 | X | GFN to reset | + * |---------------+---------------+--------------| + * + * Lifecycle of a dirty GFN goes like: + * + * dirtied harvested reset + * 00 -----------> 01 -------------> 1X -------+ + * ^ | + * | | + * +------------------------------------------+ + * + * The userspace program is only responsible for the 01->1X state + * conversion after harvesting an entry. Also, it must not skip any + * dirty bits, so that dirty bits are always harvested in sequence. + */ +#define KVM_DIRTY_GFN_F_DIRTY BIT(0) +#define KVM_DIRTY_GFN_F_RESET BIT(1) +#define KVM_DIRTY_GFN_F_MASK 0x3 + +/* + * KVM dirty rings should be mapped at KVM_DIRTY_LOG_PAGE_OFFSET of + * per-vcpu mmaped regions as an array of struct kvm_dirty_gfn. The + * size of the gfn buffer is decided by the first argument when + * enabling KVM_CAP_DIRTY_LOG_RING. + */ +struct kvm_dirty_gfn { + __u32 flags; + __u32 slot; + __u64 offset; +}; + #endif /* __LINUX_KVM_H */ diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c new file mode 100644 index 000000000000..9d01299563ee --- /dev/null +++ b/virt/kvm/dirty_ring.c @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM dirty ring implementation + * + * Copyright 2019 Red Hat, Inc. + */ +#include +#include +#include +#include +#include + +int __weak kvm_cpu_dirty_log_size(void) +{ + return 0; +} + +u32 kvm_dirty_ring_get_rsvd_entries(void) +{ + return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(); +} + +static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring) +{ + return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index); +} + +bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) +{ + return kvm_dirty_ring_used(ring) >= ring->soft_limit; +} + +static bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring) +{ + return kvm_dirty_ring_used(ring) >= ring->size; +} + +struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + WARN_ON_ONCE(vcpu->kvm != kvm); + + return &vcpu->dirty_ring; +} + +static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) +{ + struct kvm_memory_slot *memslot; + int as_id, id; + + as_id = slot >> 16; + id = (u16)slot; + + if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + return; + + memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id); + + if (!memslot || (offset + __fls(mask)) >= memslot->npages) + return; + + spin_lock(&kvm->mmu_lock); + kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); + spin_unlock(&kvm->mmu_lock); +} + +int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) +{ + ring->dirty_gfns = vmalloc(size); + if (!ring->dirty_gfns) + return -ENOMEM; + memset(ring->dirty_gfns, 0, size); + + ring->size = size / sizeof(struct kvm_dirty_gfn); + ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(); + ring->dirty_index = 0; + ring->reset_index = 0; + ring->index = index; + + return 0; +} + +static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn) +{ + gfn->flags = 0; +} + +static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn) +{ + gfn->flags = KVM_DIRTY_GFN_F_DIRTY; +} + +static inline 
bool kvm_dirty_gfn_invalid(struct kvm_dirty_gfn *gfn) +{ + return gfn->flags == 0; +} + +static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn) +{ + return gfn->flags & KVM_DIRTY_GFN_F_RESET; +} + +int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring) +{ + u32 cur_slot, next_slot; + u64 cur_offset, next_offset; + unsigned long mask; + int count = 0; + struct kvm_dirty_gfn *entry; + bool first_round = true; + + /* This is only needed to make compilers happy */ + cur_slot = cur_offset = mask = 0; + + while (true) { + entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)]; + + if (!kvm_dirty_gfn_harvested(entry)) + break; + + next_slot = READ_ONCE(entry->slot); + next_offset = READ_ONCE(entry->offset); + + /* Update the flags to reflect that this GFN is reset */ + kvm_dirty_gfn_set_invalid(entry); + + ring->reset_index++; + count++; + /* + * Try to coalesce the reset operations when the guest is + * scanning pages in the same slot. + */ + if (!first_round && next_slot == cur_slot) { + s64 delta = next_offset - cur_offset; + + if (delta >= 0 && delta < BITS_PER_LONG) { + mask |= 1ull << delta; + continue; + } + + /* Backwards visit, careful about overflows! */ + if (delta > -BITS_PER_LONG && delta < 0 && + (mask << -delta >> -delta) == mask) { + cur_offset = next_offset; + mask = (mask << -delta) | 1; + continue; + } + } + kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask); + cur_slot = next_slot; + cur_offset = next_offset; + mask = 1; + first_round = false; + } + + kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask); + + trace_kvm_dirty_ring_reset(ring); + + return count; +} + +void kvm_dirty_ring_push(struct kvm_dirty_ring *ring, u32 slot, u64 offset) +{ + struct kvm_dirty_gfn *entry; + + /* It should never get full */ + WARN_ON_ONCE(kvm_dirty_ring_full(ring)); + + entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)]; + + entry->slot = slot; + entry->offset = offset; + /* + * Make sure the data is filled in before we publish this to + * the userspace program. There's no paired kernel-side reader. + */ + smp_wmb(); + kvm_dirty_gfn_set_dirtied(entry); + ring->dirty_index++; + trace_kvm_dirty_ring_push(ring, slot, offset); +} + +struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset) +{ + return vmalloc_to_page((void *)ring->dirty_gfns + offset * PAGE_SIZE); +} + +void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) +{ + vfree(ring->dirty_gfns); + ring->dirty_gfns = NULL; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 68598fdba226..78ef414512bf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -63,6 +63,8 @@ #define CREATE_TRACE_POINTS #include +#include + /* Worst case buffer size needed for holding an integer. 
*/ #define ITOA_MAX_LEN 12 @@ -415,6 +417,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { + kvm_dirty_ring_free(&vcpu->dirty_ring); kvm_arch_vcpu_destroy(vcpu); /* @@ -2644,8 +2647,13 @@ void mark_page_dirty_in_slot(struct kvm *kvm, { if (memslot && memslot->dirty_bitmap) { unsigned long rel_gfn = gfn - memslot->base_gfn; + u32 slot = (memslot->as_id << 16) | memslot->id; - set_bit_le(rel_gfn, memslot->dirty_bitmap); + if (kvm->dirty_ring_size) + kvm_dirty_ring_push(kvm_dirty_ring_get(kvm), + slot, rel_gfn); + else + set_bit_le(rel_gfn, memslot->dirty_bitmap); } } EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); @@ -3005,6 +3013,17 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) } EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); +static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff) +{ +#if KVM_DIRTY_LOG_PAGE_OFFSET > 0 + return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) && + (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET + + kvm->dirty_ring_size / PAGE_SIZE); +#else + return false; +#endif +} + static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) { struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; @@ -3020,6 +3039,10 @@ static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif + else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff)) + page = kvm_dirty_ring_get_page( + &vcpu->dirty_ring, + vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET); else return kvm_arch_vcpu_fault(vcpu, vmf); get_page(page); @@ -3033,6 +3056,14 @@ static const struct vm_operations_struct kvm_vcpu_vm_ops = { static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) { + struct kvm_vcpu *vcpu = file->private_data; + unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + + if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) || + kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) && + ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED))) + return -EINVAL; + vma->vm_ops = &kvm_vcpu_vm_ops; return 0; } @@ -3126,6 +3157,13 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r) goto vcpu_free_run_page; + if (kvm->dirty_ring_size) { + r = kvm_dirty_ring_alloc(&vcpu->dirty_ring, + id, kvm->dirty_ring_size); + if (r) + goto arch_vcpu_destroy; + } + mutex_lock(&kvm->lock); if (kvm_get_vcpu_by_id(kvm, id)) { r = -EEXIST; @@ -3159,6 +3197,8 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) unlock_vcpu_destroy: mutex_unlock(&kvm->lock); + kvm_dirty_ring_free(&vcpu->dirty_ring); +arch_vcpu_destroy: kvm_arch_vcpu_destroy(vcpu); vcpu_free_run_page: free_page((unsigned long)vcpu->run); @@ -3631,12 +3671,78 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; + case KVM_CAP_DIRTY_LOG_RING: +#if KVM_DIRTY_LOG_PAGE_OFFSET > 0 + return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); +#else + return 0; +#endif default: break; } return kvm_vm_ioctl_check_extension(kvm, arg); } +static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) +{ + int r; + + if (!KVM_DIRTY_LOG_PAGE_OFFSET) + return -EINVAL; + + /* the size should be power of 2 */ + if (!size || (size & (size - 1))) + return -EINVAL; + + /* Should be bigger to keep the reserved entries, or a page */ + if (size < kvm_dirty_ring_get_rsvd_entries() * + sizeof(struct kvm_dirty_gfn) || size < 
PAGE_SIZE) + return -EINVAL; + + if (size > KVM_DIRTY_RING_MAX_ENTRIES * + sizeof(struct kvm_dirty_gfn)) + return -E2BIG; + + /* We only allow it to be set once */ + if (kvm->dirty_ring_size) + return -EINVAL; + + mutex_lock(&kvm->lock); + + if (kvm->created_vcpus) { + /* We don't allow changing this value after vcpus are created */ + r = -EINVAL; + } else { + kvm->dirty_ring_size = size; + r = 0; + } + + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm) +{ + int i; + struct kvm_vcpu *vcpu; + int cleared = 0; + + if (!kvm->dirty_ring_size) + return -EINVAL; + + mutex_lock(&kvm->slots_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) + cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring); + + mutex_unlock(&kvm->slots_lock); + + if (cleared) + kvm_flush_remote_tlbs(kvm); + + return cleared; +} + int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -3667,6 +3773,8 @@ static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, kvm->max_halt_poll_ns = cap->args[0]; return 0; } + case KVM_CAP_DIRTY_LOG_RING: + return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); default: return kvm_vm_ioctl_enable_cap(kvm, cap); } @@ -3851,6 +3959,9 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_CHECK_EXTENSION: r = kvm_vm_ioctl_check_extension_generic(kvm, arg); break; + case KVM_RESET_DIRTY_RINGS: + r = kvm_vm_ioctl_reset_dirty_pages(kvm); + break; default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } -- cgit v1.2.3 From 044c59c409b7fd753707dc437890e94d2b0bd819 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 30 Sep 2020 21:22:26 -0400 Subject: KVM: Don't allocate dirty bitmap if dirty ring is enabled Because kvm dirty rings and the kvm dirty log are used in an exclusive way, let's avoid creating the dirty_bitmap when the kvm dirty ring is enabled. In the meantime, since the dirty_bitmap will be conditionally created now, we can't use it as a sign of "whether this memory slot enabled dirty tracking". Change users like that to check against the kvm memory slot flags. Note that there can still be cases where a kvm memory slot gets its dirty_bitmap allocated: _if_ the memory slots are created before the dirty rings are enabled, and at the same time with the dirty tracking capability enabled, they'll still have the dirty_bitmap. However, it should not hurt much (e.g., the bitmaps will always be freed if they are there), and the real users normally won't trigger this because the dirty bit tracking flag should in most cases be applied to kvm slots only before migration starts, which is far later than kvm initialization (VM start).
Signed-off-by: Peter Xu Message-Id: <20201001012226.5868-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- include/linux/kvm_host.h | 5 +++++ virt/kvm/kvm_main.c | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 12e5cfe0995e..5dfe0ede0e81 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -820,7 +820,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return NULL; - if (no_dirty_log && slot->dirty_bitmap) + if (no_dirty_log && kvm_slot_dirty_track_enabled(slot)) return NULL; return slot; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 864b156391c8..f3b1013fb22c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -351,6 +351,11 @@ struct kvm_memory_slot { u16 as_id; }; +static inline bool kvm_slot_dirty_track_enabled(struct kvm_memory_slot *slot) +{ + return slot->flags & KVM_MEM_LOG_DIRTY_PAGES; +} + static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) { return ALIGN(memslot->npages, BITS_PER_LONG) / 8; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 110aa5cc0c93..3abcb2ce5b7d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1365,7 +1365,7 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Allocate/free page dirty bitmap as needed */ if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) new.dirty_bitmap = NULL; - else if (!new.dirty_bitmap && !kvm->dirty_ring_size) { + r = kvm_alloc_dirty_bitmap(&new); if (r) return r; @@ -2657,7 +2657,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t gfn) { - if (memslot && memslot->dirty_bitmap) { + if (memslot && kvm_slot_dirty_track_enabled(memslot)) { unsigned long rel_gfn = gfn - memslot->base_gfn; u32 slot = (memslot->as_id << 16) | memslot->id; -- cgit v1.2.3 From 23d89aa0c2192f2d4582198b381d8805492c7925 Mon Sep 17 00:00:00 2001 From: Daniel Baluta Date: Wed, 11 Nov 2020 13:11:18 +0200 Subject: firmware: imx-dsp: Export functions to request/free channels In order to save power, we only need to request a channel when communication with the DSP is active. For this we export the following functions: - imx_dsp_request_channel, gets a channel with a given index - imx_dsp_free_channel, frees a channel with a given index Notice that we still request channels at probe to support devices that do not have PM callbacks implemented. More explanation of why requesting a channel has an effect on power savings: - requesting a mailbox channel will call mailbox's startup function. - the startup function calls pm_runtime_get_sync(), which increments the device usage count and will keep the device active. Specifically, the mailbox clock will always be ON when a mailbox channel is requested.
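A sketch of the intended usage (assumed driver names and channel index, not taken from this patch): a client driver frees its channel from its runtime-suspend hook and re-requests it on resume, letting the mailbox clock gate in between.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/firmware/imx/dsp.h>

struct my_dsp_priv {
	struct imx_dsp_ipc *dsp_ipc;	/* obtained at probe (assumed) */
};

static int my_dsp_runtime_suspend(struct device *dev)
{
	struct my_dsp_priv *priv = dev_get_drvdata(dev);

	/* Drop the channel so the mailbox (and its clock) can idle. */
	imx_dsp_free_channel(priv->dsp_ipc, 0);
	return 0;
}

static int my_dsp_runtime_resume(struct device *dev)
{
	struct my_dsp_priv *priv = dev_get_drvdata(dev);
	struct mbox_chan *ch;

	/* Re-requesting powers the mailbox back up via pm_runtime_get_sync(). */
	ch = imx_dsp_request_channel(priv->dsp_ipc, 0);
	return PTR_ERR_OR_ZERO(ch);
}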
Signed-off-by: Daniel Baluta Reviewed-by: Paul Olaru Signed-off-by: Shawn Guo --- drivers/firmware/imx/imx-dsp.c | 25 +++++++++++++++++++++++++ include/linux/firmware/imx/dsp.h | 10 ++++++++++ 2 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/imx/imx-dsp.c b/drivers/firmware/imx/imx-dsp.c index b6e95d6d34c0..a6c06d7476c3 100644 --- a/drivers/firmware/imx/imx-dsp.c +++ b/drivers/firmware/imx/imx-dsp.c @@ -60,6 +60,31 @@ static void imx_dsp_handle_rx(struct mbox_client *c, void *msg) } } +struct mbox_chan *imx_dsp_request_channel(struct imx_dsp_ipc *dsp_ipc, int idx) +{ + struct imx_dsp_chan *dsp_chan; + + if (idx >= DSP_MU_CHAN_NUM) + return ERR_PTR(-EINVAL); + + dsp_chan = &dsp_ipc->chans[idx]; + dsp_chan->ch = mbox_request_channel_byname(&dsp_chan->cl, dsp_chan->name); + return dsp_chan->ch; +} +EXPORT_SYMBOL(imx_dsp_request_channel); + +void imx_dsp_free_channel(struct imx_dsp_ipc *dsp_ipc, int idx) +{ + struct imx_dsp_chan *dsp_chan; + + if (idx >= DSP_MU_CHAN_NUM) + return; + + dsp_chan = &dsp_ipc->chans[idx]; + mbox_free_channel(dsp_chan->ch); +} +EXPORT_SYMBOL(imx_dsp_free_channel); + static int imx_dsp_setup_channels(struct imx_dsp_ipc *dsp_ipc) { struct device *dev = dsp_ipc->dev; diff --git a/include/linux/firmware/imx/dsp.h b/include/linux/firmware/imx/dsp.h index 7562099c9e46..4f7895a3b73c 100644 --- a/include/linux/firmware/imx/dsp.h +++ b/include/linux/firmware/imx/dsp.h @@ -55,6 +55,9 @@ static inline void *imx_dsp_get_data(struct imx_dsp_ipc *ipc) int imx_dsp_ring_doorbell(struct imx_dsp_ipc *dsp, unsigned int chan_idx); +struct mbox_chan *imx_dsp_request_channel(struct imx_dsp_ipc *ipc, int idx); +void imx_dsp_free_channel(struct imx_dsp_ipc *ipc, int idx); + #else static inline int imx_dsp_ring_doorbell(struct imx_dsp_ipc *ipc, @@ -63,5 +66,12 @@ static inline int imx_dsp_ring_doorbell(struct imx_dsp_ipc *ipc, return -ENOTSUPP; } +static inline struct mbox_chan *imx_dsp_request_channel(struct imx_dsp_ipc *ipc, int idx) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void imx_dsp_free_channel(struct imx_dsp_ipc *ipc, int idx) { } + #endif #endif /* _IMX_DSP_IPC_H */ -- cgit v1.2.3 From cfeeea60af2f01c13b94d57a9bb1291e7bc181da Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 16 Nov 2020 12:57:13 +0200 Subject: bus: ti-sysc: Implement GPMC debug quirk to drop platform data We need to enable the no-reset-on-init quirk for GPMC if CONFIG_OMAP_GPMC_DEBUG is set. Otherwise the GPMC driver code is unable to show the bootloader-configured timings.
Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 10 ++++++++++ include/linux/platform_data/ti-sysc.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 88a5d22091f3..691cc39bfc5c 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -1383,6 +1383,8 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { SYSC_QUIRK_CLKDM_NOAUTO), SYSC_QUIRK("dwc3", 0x488c0000, 0, 0x10, -ENODEV, 0x500a0200, 0xffffffff, SYSC_QUIRK_CLKDM_NOAUTO), + SYSC_QUIRK("gpmc", 0, 0, 0x10, 0x14, 0x00000060, 0xffffffff, + SYSC_QUIRK_GPMC_DEBUG), SYSC_QUIRK("hdmi", 0, 0, 0x10, -ENODEV, 0x50030200, 0xffffffff, SYSC_QUIRK_OPT_CLKS_NEEDED), SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x00000006, 0xffffffff, @@ -1818,6 +1820,14 @@ static void sysc_init_module_quirks(struct sysc *ddata) return; } +#ifdef CONFIG_OMAP_GPMC_DEBUG + if (ddata->cfg.quirks & SYSC_QUIRK_GPMC_DEBUG) { + ddata->cfg.quirks |= SYSC_QUIRK_NO_RESET_ON_INIT; + + return; + } +#endif + if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_I2C) { ddata->pre_reset_quirk = sysc_pre_reset_quirk_i2c; ddata->post_reset_quirk = sysc_post_reset_quirk_i2c; diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 240dce553a0b..fafc1beea504 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -50,6 +50,7 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSC_QUIRK_GPMC_DEBUG BIT(26) #define SYSC_MODULE_QUIRK_ENA_RESETDONE BIT(25) #define SYSC_MODULE_QUIRK_PRUSS BIT(24) #define SYSC_MODULE_QUIRK_DSS_RESET BIT(23) -- cgit v1.2.3 From 13daf48978280ea8bce38f1e0598b913b09f5395 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 9 Nov 2020 22:53:16 +0200 Subject: gpiolib: Replace unsigned by unsigned int Replace unsigned by unsigned int in GPIO library code. Note, legacy API left untouched. 
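To make the scope concrete, here is the distinction in a hedged illustration (not part of the diff): the conversion covers the descriptor-based consumer API, while the deprecated integer-based legacy API keeps its historical bare 'unsigned' spelling:

/* Legacy API in <linux/gpio.h>: intentionally left untouched */
int gpio_request(unsigned gpio, const char *label);

/* Descriptor-based API in <linux/gpio/consumer.h>: now spelled out */
int gpiod_set_debounce(struct gpio_desc *desc, unsigned int debounce);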
Signed-off-by: Andy Shevchenko Acked-by: Linus Walleij Reviewed-by: Hans de Goede Reviewed-by: Mika Westerberg --- drivers/gpio/gpiolib.c | 16 ++++++++-------- include/linux/gpio/consumer.h | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index c980ddcda833..fe31e7f1fb6e 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -211,7 +211,7 @@ static int gpiochip_find_base(int ngpio) int gpiod_get_direction(struct gpio_desc *desc) { struct gpio_chip *gc; - unsigned offset; + unsigned int offset; int ret; gc = gpiod_to_chip(desc); @@ -1333,7 +1333,7 @@ void gpiochip_irq_domain_deactivate(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(gpiochip_irq_domain_deactivate); -static int gpiochip_to_irq(struct gpio_chip *gc, unsigned offset) +static int gpiochip_to_irq(struct gpio_chip *gc, unsigned int offset) { struct irq_domain *domain = gc->irq.domain; @@ -1635,7 +1635,7 @@ static inline void gpiochip_irqchip_free_valid_mask(struct gpio_chip *gc) * @gc: the gpiochip owning the GPIO * @offset: the offset of the GPIO to request for GPIO function */ -int gpiochip_generic_request(struct gpio_chip *gc, unsigned offset) +int gpiochip_generic_request(struct gpio_chip *gc, unsigned int offset) { #ifdef CONFIG_PINCTRL if (list_empty(&gc->gpiodev->pin_ranges)) @@ -1651,7 +1651,7 @@ EXPORT_SYMBOL_GPL(gpiochip_generic_request); * @gc: the gpiochip to request the gpio function for * @offset: the offset of the GPIO to free from GPIO function */ -void gpiochip_generic_free(struct gpio_chip *gc, unsigned offset) +void gpiochip_generic_free(struct gpio_chip *gc, unsigned int offset) { pinctrl_gpio_free(gc->gpiodev->base + offset); } @@ -1663,7 +1663,7 @@ EXPORT_SYMBOL_GPL(gpiochip_generic_free); * @offset: the offset of the GPIO to apply the configuration * @config: the configuration to be applied */ -int gpiochip_generic_config(struct gpio_chip *gc, unsigned offset, +int gpiochip_generic_config(struct gpio_chip *gc, unsigned int offset, unsigned long config) { return pinctrl_gpio_set_config(gc->gpiodev->base + offset, config); @@ -1993,7 +1993,7 @@ void gpiod_free(struct gpio_desc *desc) * help with diagnostics, and knowing that the signal is used as a GPIO * can help avoid accidentally multiplexing it to another controller. */ -const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned offset) +const char *gpiochip_is_requested(struct gpio_chip *gc, unsigned int offset) { struct gpio_desc *desc; @@ -2097,7 +2097,7 @@ static int gpio_set_config(struct gpio_desc *desc, enum pin_config_param mode) { struct gpio_chip *gc = desc->gdev->chip; unsigned long config; - unsigned arg; + unsigned int arg; switch (mode) { case PIN_CONFIG_BIAS_PULL_DOWN: @@ -2353,7 +2353,7 @@ EXPORT_SYMBOL_GPL(gpiod_set_config); * 0 on success, %-ENOTSUPP if the controller doesn't support setting the * debounce time. 
*/ -int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce) +int gpiod_set_debounce(struct gpio_desc *desc, unsigned int debounce) { unsigned long config; diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 901aab89d025..ef49307611d2 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -158,7 +158,7 @@ int gpiod_set_raw_array_value_cansleep(unsigned int array_size, unsigned long *value_bitmap); int gpiod_set_config(struct gpio_desc *desc, unsigned long config); -int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce); +int gpiod_set_debounce(struct gpio_desc *desc, unsigned int debounce); int gpiod_set_transitory(struct gpio_desc *desc, bool transitory); void gpiod_toggle_active_low(struct gpio_desc *desc); @@ -481,7 +481,7 @@ static inline int gpiod_set_config(struct gpio_desc *desc, unsigned long config) return -ENOSYS; } -static inline int gpiod_set_debounce(struct gpio_desc *desc, unsigned debounce) +static inline int gpiod_set_debounce(struct gpio_desc *desc, unsigned int debounce) { /* GPIO can never have been requested */ WARN_ON(desc); -- cgit v1.2.3 From 557acb3d2cd9c82de19f944f6cc967a347735385 Mon Sep 17 00:00:00 2001 From: Amjad Ouled-Ameur Date: Fri, 13 Nov 2020 00:00:43 +0100 Subject: reset: make shared pulsed reset controls re-triggerable The current reset framework API does not allow releasing what reset_control_reset() has done, IOW decrementing triggered_count. Add the new reset_control_rearm() call to do so. When reset_control_reset() has been called once, the counter triggered_count, in the reset framework, is incremented, i.e. the resource under the reset is in use and the reset should not be done again. reset_control_rearm() would be the way to state that the resource is no longer used and that, from the caller's perspective, the reset can be fired again if necessary. Signed-off-by: Amjad Ouled-Ameur Reported-by: Jerome Brunet Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/reset.h | 1 + 2 files changed, 74 insertions(+) (limited to 'include/linux') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index a2df88e90011..34e89aa0fb5e 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -208,6 +208,39 @@ static int reset_control_array_reset(struct reset_control_array *resets) return 0; } +static int reset_control_array_rearm(struct reset_control_array *resets) +{ + struct reset_control *rstc; + int i; + + for (i = 0; i < resets->num_rstcs; i++) { + rstc = resets->rstc[i]; + + if (!rstc) + continue; + + if (WARN_ON(IS_ERR(rstc))) + return -EINVAL; + + if (rstc->shared) { + if (WARN_ON(atomic_read(&rstc->deassert_count) != 0)) + return -EINVAL; + } else { + if (!rstc->acquired) + return -EPERM; + } + } + + for (i = 0; i < resets->num_rstcs; i++) { + rstc = resets->rstc[i]; + + if (rstc && rstc->shared) + WARN_ON(atomic_dec_return(&rstc->triggered_count) < 0); + } + + return 0; +} + static int reset_control_array_assert(struct reset_control_array *resets) { int ret, i; @@ -325,6 +358,46 @@ int reset_control_reset(struct reset_control *rstc) } EXPORT_SYMBOL_GPL(reset_control_reset); +/** + * reset_control_rearm - allow shared reset line to be re-triggered + * @rstc: reset controller + * + * On a shared reset line the actual reset pulse is only triggered once for the + * lifetime of the reset_control instance, except if this call is used.
+ * + * Calls to this function must be balanced with calls to reset_control_reset, + * a warning is thrown in case triggered_count ever dips below 0. + * + * Consumers must not use reset_control_(de)assert on shared reset lines when + * reset_control_reset or reset_control_rearm have been used. + * + * If rstc is NULL the function will just return 0. + */ +int reset_control_rearm(struct reset_control *rstc) +{ + if (!rstc) + return 0; + + if (WARN_ON(IS_ERR(rstc))) + return -EINVAL; + + if (reset_control_is_array(rstc)) + return reset_control_array_rearm(rstc_to_array(rstc)); + + if (rstc->shared) { + if (WARN_ON(atomic_read(&rstc->deassert_count) != 0)) + return -EINVAL; + + WARN_ON(atomic_dec_return(&rstc->triggered_count) < 0); + } else { + if (!rstc->acquired) + return -EPERM; + } + + return 0; +} +EXPORT_SYMBOL_GPL(reset_control_rearm); + /** * reset_control_assert - asserts the reset line * @rstc: reset controller diff --git a/include/linux/reset.h b/include/linux/reset.h index 05aa9f440f48..439fec7112a9 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -13,6 +13,7 @@ struct reset_control; #ifdef CONFIG_RESET_CONTROLLER int reset_control_reset(struct reset_control *rstc); +int reset_control_rearm(struct reset_control *rstc); int reset_control_assert(struct reset_control *rstc); int reset_control_deassert(struct reset_control *rstc); int reset_control_status(struct reset_control *rstc); -- cgit v1.2.3 From 16fee29b07358293f135759d9fdbf1267da57ebd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Nov 2020 17:02:17 +0100 Subject: dma-mapping: remove the dma_direct_set_offset export Drop the dma_direct_set_offset export and move the declaration to dma-map-ops.h now that the Allwinner drivers have stopped calling it. Signed-off-by: Christoph Hellwig Signed-off-by: Maxime Ripard --- arch/arm/mach-keystone/keystone.c | 2 +- arch/arm/mach-omap1/usb.c | 2 +- arch/sh/drivers/pci/pcie-sh7786.c | 2 +- arch/x86/pci/sta2x11-fixup.c | 3 ++- include/linux/dma-map-ops.h | 3 +++ include/linux/dma-mapping.h | 7 ------- kernel/dma/direct.c | 1 - 7 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-keystone/keystone.c b/arch/arm/mach-keystone/keystone.c index 09a65c2dfd73..cd711bfc591f 100644 --- a/arch/arm/mach-keystone/keystone.c +++ b/arch/arm/mach-keystone/keystone.c @@ -8,7 +8,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-omap1/usb.c b/arch/arm/mach-omap1/usb.c index ba8566204ea9..86d3b3c157af 100644 --- a/arch/arm/mach-omap1/usb.c +++ b/arch/arm/mach-omap1/usb.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/drivers/pci/pcie-sh7786.c b/arch/sh/drivers/pci/pcie-sh7786.c index 4468289ab2ca..4d499476c33a 100644 --- a/arch/sh/drivers/pci/pcie-sh7786.c +++ b/arch/sh/drivers/pci/pcie-sh7786.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 5701d5ba3df4..7d2525691854 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -11,7 +11,8 @@ #include #include #include -#include +#include +#include #include #define STA2X11_SWIOTLB_SIZE (4*1024*1024) diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index a5f89fc4d6df..03925e438ec3 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -226,6 +226,9 @@ struct page 
*dma_alloc_from_pool(struct device *dev, size_t size, bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)); bool dma_free_from_pool(struct device *dev, void *start, size_t size); +int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, + dma_addr_t dma_start, u64 size); + #ifdef CONFIG_ARCH_HAS_DMA_COHERENCE_H #include #elif defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 956151052d45..199d85285246 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -558,13 +558,6 @@ static inline int dma_mmap_wc(struct device *dev, #define dma_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) #endif -/* - * Legacy interface to set up the dma offset map. Drivers really should not - * actually use it, but we have a few legacy cases left. - */ -int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, - dma_addr_t dma_start, u64 size); - extern const struct dma_map_ops dma_virt_ops; #endif /* _LINUX_DMA_MAPPING_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 06c111544f61..002268262c9a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -547,4 +547,3 @@ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, dev->dma_range_map = map; return 0; } -EXPORT_SYMBOL_GPL(dma_direct_set_offset); -- cgit v1.2.3 From 4abb1e5b63ac3281275315fc6b0cde0b9c2e2e42 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 11 Nov 2020 15:53:17 +0100 Subject: powerpc/mm: factor out creating/removing linear mapping We want to stop abusing memory hotplug infrastructure in memtrace code to perform allocations and remove the linear mapping. Instead we will use alloc_contig_pages() and remove the linear mapping manually. Let's factor out creating/removing the linear mapping into arch_create_linear_mapping() / arch_remove_linear_mapping() - so in the future, we might be able to have whole arch_add_memory() / arch_remove_memory() be implemented in common code. 
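For illustration, a sketch of how a caller like memtrace might then combine alloc_contig_pages() with the new helper instead of abusing memory hotplug (the function name is hypothetical and error handling is elided; this is not code from this series):

#include <linux/gfp.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/pfn.h>

static struct page *memtrace_style_alloc(unsigned long nr_pages, int nid)
{
	struct page *page;

	/* Grab physically contiguous memory without hotplug tricks */
	page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE,
				  nid, NULL);
	if (!page)
		return NULL;

	/* Unmap it from the kernel linear mapping; stray accesses fault */
	arch_remove_linear_mapping(PFN_PHYS(page_to_pfn(page)),
				   nr_pages << PAGE_SHIFT);

	return page;
}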
Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201111145322.15793-4-david@redhat.com --- arch/powerpc/mm/mem.c | 41 ++++++++++++++++++++++++++++------------- include/linux/memory_hotplug.h | 3 +++ 2 files changed, 31 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 01ec2a252f09..8a86d81f8df0 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -120,34 +120,26 @@ static void flush_dcache_range_chunked(unsigned long start, unsigned long stop, } } -int __ref arch_add_memory(int nid, u64 start, u64 size, - struct mhp_params *params) +int __ref arch_create_linear_mapping(int nid, u64 start, u64 size, + struct mhp_params *params) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; int rc; start = (unsigned long)__va(start); rc = create_section_mapping(start, start + size, nid, params->pgprot); if (rc) { - pr_warn("Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n", + pr_warn("Unable to create linear mapping for 0x%llx..0x%llx: %d\n", start, start + size, rc); return -EFAULT; } - - return __add_pages(nid, start_pfn, nr_pages, params); + return 0; } -void __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_linear_mapping(u64 start, u64 size) { - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - __remove_pages(start_pfn, nr_pages, altmap); - /* Remove htab bolted mappings for this section of memory */ start = (unsigned long)__va(start); flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE); @@ -160,6 +152,29 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size, */ vm_unmap_aliases(); } + +int __ref arch_add_memory(int nid, u64 start, u64 size, + struct mhp_params *params) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + int rc; + + rc = arch_create_linear_mapping(nid, start, size, params); + if (rc) + return rc; + return __add_pages(nid, start_pfn, nr_pages, params); +} + +void __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + __remove_pages(start_pfn, nr_pages, altmap); + arch_remove_linear_mapping(start, size); +} #endif #ifndef CONFIG_NEED_MULTIPLE_NODES diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d65c6fdc5cfc..00b9e9bd3850 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -375,6 +375,9 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages); +extern int arch_create_linear_mapping(int nid, u64 start, u64 size, + struct mhp_params *params); +void arch_remove_linear_mapping(u64 start, u64 size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ -- cgit v1.2.3 From 295992fb815e791d14b18ef7cdbbaf1a76211a31 Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 14 Sep 2020 15:09:33 +0200 Subject: mm: introduce vma_set_file function v5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the new vma_set_file() function to allow changing vma->vm_file with the necessary 
refcount dance. v2: add more users of this. v3: add missing EXPORT_SYMBOL, rebase on mmap cleanup, add comments why we drop the reference on two occasions. v4: make it clear that changing an anonymous vma is illegal. v5: move vma_set_file to mm/util.c Signed-off-by: Christian König Reviewed-by: Daniel Vetter (v2) Reviewed-by: Jason Gunthorpe Acked-by: Andrew Morton Link: https://patchwork.freedesktop.org/patch/399360/ --- drivers/dma-buf/dma-buf.c | 3 +-- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 4 +--- drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 3 +-- drivers/gpu/drm/i915/gem/i915_gem_mman.c | 5 +++-- drivers/gpu/drm/msm/msm_gem.c | 4 +--- drivers/gpu/drm/omapdrm/omap_gem.c | 3 +-- drivers/gpu/drm/vgem/vgem_drv.c | 3 +-- drivers/staging/android/ashmem.c | 6 +++--- include/linux/mm.h | 2 ++ mm/util.c | 12 ++++++++++++ 10 files changed, 26 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 282bd8b84170..e63684d4cd90 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1183,8 +1183,7 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma, return -EINVAL; /* readjust the vma */ - fput(vma->vm_file); - vma->vm_file = get_file(dmabuf->file); + vma_set_file(vma, dmabuf->file); vma->vm_pgoff = pgoff; return dmabuf->ops->mmap(dmabuf, vma); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index bbd235473645..6d38c5c17f23 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -145,10 +145,8 @@ static int etnaviv_gem_mmap_obj(struct etnaviv_gem_object *etnaviv_obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ - fput(vma->vm_file); - get_file(etnaviv_obj->base.filp); vma->vm_pgoff = 0; - vma->vm_file = etnaviv_obj->base.filp; + vma_set_file(vma, etnaviv_obj->base.filp); vma->vm_page_prot = vm_page_prot; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index 0dd477e56573..04e9c04545ad 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -114,8 +114,7 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct * if (ret) return ret; - fput(vma->vm_file); - vma->vm_file = get_file(obj->base.filp); + vma_set_file(vma, obj->base.filp); return 0; } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c index 3d69e51f3e4d..ec28a6cde49b 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c @@ -893,8 +893,9 @@ int i915_gem_mmap(struct file *filp, struct vm_area_struct *vma) * requires avoiding extraneous references to their filp, hence why * we prefer to use an anonymous file for their mmaps. */ - fput(vma->vm_file); - vma->vm_file = anon; + vma_set_file(vma, anon); + /* Drop the initial creation reference, the vma is now holding one. 
*/ + fput(anon); switch (mmo->mmap_type) { case I915_MMAP_TYPE_WC: diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 2e1bce7c0b19..311721ceee50 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -212,10 +212,8 @@ int msm_gem_mmap_obj(struct drm_gem_object *obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ - fput(vma->vm_file); - get_file(obj->filp); vma->vm_pgoff = 0; - vma->vm_file = obj->filp; + vma_set_file(vma, obj->filp); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index 68c271f4250b..30d299ca8795 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -564,9 +564,8 @@ int omap_gem_mmap_obj(struct drm_gem_object *obj, * address_space (so unmap_mapping_range does what we want, * in particular in the case of mmap'd dmabufs) */ - fput(vma->vm_file); vma->vm_pgoff = 0; - vma->vm_file = get_file(obj->filp); + vma_set_file(vma, obj->filp); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index 9a413091abb6..f8635ccaf9a1 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -403,8 +403,7 @@ static int vgem_prime_mmap(struct drm_gem_object *obj, if (ret) return ret; - fput(vma->vm_file); - vma->vm_file = get_file(obj->filp); + vma_set_file(vma, obj->filp); vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index 10b4be1f3e78..4789d36ddfd3 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -450,9 +450,9 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma) vma_set_anonymous(vma); } - if (vma->vm_file) - fput(vma->vm_file); - vma->vm_file = asma->file; + vma_set_file(vma, asma->file); + /* XXX: merge this with the get_file() above if possible */ + fput(asma->file); out: mutex_unlock(&ashmem_mutex); diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..47bff16c182d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2719,6 +2719,8 @@ static inline void vma_set_page_prot(struct vm_area_struct *vma) } #endif +void vma_set_file(struct vm_area_struct *vma, struct file *file); + #ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/mm/util.c b/mm/util.c index 4ddb6e186dd5..8c9b7d1e7c49 100644 --- a/mm/util.c +++ b/mm/util.c @@ -311,6 +311,18 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } +/* + * Change backing file, only valid to use during initial VMA setup. 
+ */ +void vma_set_file(struct vm_area_struct *vma, struct file *file) +{ + /* Changing an anonymous vma with this is illegal */ + get_file(file); + swap(vma->vm_file, file); + fput(file); +} +EXPORT_SYMBOL(vma_set_file); + #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif -- cgit v1.2.3 From 25ece30561d247b2931b0d11d92e9c976a668771 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 9 Nov 2020 17:34:05 +0100 Subject: rtc: nvmem: remove nvram ABI The nvram sysfs attributes have been deprecated at least since v4.13, more than 3 years ago and nobody ever complained about the deprecation warning. Remove the sysfs attributes now. [Bartosz: remove the declaration of rtc_nvmem_unregister()] Signed-off-by: Alexandre Belloni Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20201109163409.24301-5-brgl@bgdev.pl --- drivers/rtc/class.c | 2 -- drivers/rtc/nvmem.c | 82 +--------------------------------------------- drivers/rtc/rtc-cmos.c | 1 - drivers/rtc/rtc-ds1305.c | 1 - drivers/rtc/rtc-ds1307.c | 1 - drivers/rtc/rtc-ds1343.c | 1 - drivers/rtc/rtc-ds1511.c | 2 -- drivers/rtc/rtc-ds1553.c | 1 - drivers/rtc/rtc-ds1685.c | 1 - drivers/rtc/rtc-ds1742.c | 1 - drivers/rtc/rtc-m48t59.c | 1 - drivers/rtc/rtc-m48t86.c | 1 - drivers/rtc/rtc-rp5c01.c | 1 - drivers/rtc/rtc-rv8803.c | 1 - drivers/rtc/rtc-stk17ta8.c | 1 - drivers/rtc/rtc-tx4939.c | 1 - include/linux/rtc.h | 6 ---- 17 files changed, 1 insertion(+), 104 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 7c88d190c51f..a99b7d24b77c 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -339,8 +339,6 @@ static void devm_rtc_release_device(struct device *dev, void *res) { struct rtc_device *rtc = *(struct rtc_device **)res; - rtc_nvmem_unregister(rtc); - if (rtc->registered) rtc_device_unregister(rtc); else diff --git a/drivers/rtc/nvmem.c b/drivers/rtc/nvmem.c index 4312096c7738..5e0b178a3b65 100644 --- a/drivers/rtc/nvmem.c +++ b/drivers/rtc/nvmem.c @@ -9,74 +9,7 @@ #include #include #include -#include -#include -/* - * Deprecated ABI compatibility, this should be removed at some point - */ - -static const char nvram_warning[] = "Deprecated ABI, please use nvmem"; - -static ssize_t -rtc_nvram_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - dev_warn_once(kobj_to_dev(kobj), nvram_warning); - - return nvmem_device_read(attr->private, off, count, buf); -} - -static ssize_t -rtc_nvram_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *attr, - char *buf, loff_t off, size_t count) -{ - dev_warn_once(kobj_to_dev(kobj), nvram_warning); - - return nvmem_device_write(attr->private, off, count, buf); -} - -static int rtc_nvram_register(struct rtc_device *rtc, - struct nvmem_device *nvmem, size_t size) -{ - int err; - - rtc->nvram = kzalloc(sizeof(*rtc->nvram), GFP_KERNEL); - if (!rtc->nvram) - return -ENOMEM; - - rtc->nvram->attr.name = "nvram"; - rtc->nvram->attr.mode = 0644; - rtc->nvram->private = nvmem; - - sysfs_bin_attr_init(rtc->nvram); - - rtc->nvram->read = rtc_nvram_read; - rtc->nvram->write = rtc_nvram_write; - rtc->nvram->size = size; - - err = sysfs_create_bin_file(&rtc->dev.parent->kobj, - rtc->nvram); - if (err) { - kfree(rtc->nvram); - rtc->nvram = NULL; - } - - return err; -} - -static void rtc_nvram_unregister(struct rtc_device *rtc) -{ - sysfs_remove_bin_file(&rtc->dev.parent->kobj, rtc->nvram); - kfree(rtc->nvram); - 
rtc->nvram = NULL; -} - -/* - * New ABI, uses nvmem - */ int rtc_nvmem_register(struct rtc_device *rtc, struct nvmem_config *nvmem_config) { @@ -88,20 +21,7 @@ int rtc_nvmem_register(struct rtc_device *rtc, nvmem_config->dev = rtc->dev.parent; nvmem_config->owner = rtc->owner; nvmem = devm_nvmem_register(rtc->dev.parent, nvmem_config); - if (IS_ERR(nvmem)) - return PTR_ERR(nvmem); - - /* Register the old ABI */ - if (rtc->nvram_old_abi) - rtc_nvram_register(rtc, nvmem, nvmem_config->size); - return 0; + return PTR_ERR_OR_ZERO(nvmem); } EXPORT_SYMBOL_GPL(rtc_nvmem_register); - -void rtc_nvmem_unregister(struct rtc_device *rtc) -{ - /* unregister the old ABI */ - if (rtc->nvram) - rtc_nvram_unregister(rtc); -} diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index c633319cdb91..adca0de76e53 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -863,7 +863,6 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) cmos_rtc.rtc->ops = &cmos_rtc_ops_no_alarm; } - cmos_rtc.rtc->nvram_old_abi = true; retval = rtc_register_device(cmos_rtc.rtc); if (retval) goto cleanup2; diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index a3d790889eea..a1ed539d41b4 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -694,7 +694,6 @@ static int ds1305_probe(struct spi_device *spi) ds1305->rtc->range_max = RTC_TIMESTAMP_END_2099; ds1305_nvmem_cfg.priv = ds1305; - ds1305->rtc->nvram_old_abi = true; status = rtc_register_device(ds1305->rtc); if (status) return status; diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index fdf25db1b1b3..e359cbf7882b 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -2016,7 +2016,6 @@ static int ds1307_probe(struct i2c_client *client, .priv = ds1307, }; - ds1307->rtc->nvram_old_abi = true; rtc_nvmem_register(ds1307->rtc, &nvmem_cfg); } diff --git a/drivers/rtc/rtc-ds1343.c b/drivers/rtc/rtc-ds1343.c index ba143423875b..e7604e844cbd 100644 --- a/drivers/rtc/rtc-ds1343.c +++ b/drivers/rtc/rtc-ds1343.c @@ -399,7 +399,6 @@ static int ds1343_probe(struct spi_device *spi) if (IS_ERR(priv->rtc)) return PTR_ERR(priv->rtc); - priv->rtc->nvram_old_abi = true; priv->rtc->ops = &ds1343_rtc_ops; priv->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; priv->rtc->range_max = RTC_TIMESTAMP_END_2099; diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index a63872c4c76d..33c483d759c8 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -466,8 +466,6 @@ static int ds1511_rtc_probe(struct platform_device *pdev) pdata->rtc->ops = &ds1511_rtc_ops; - pdata->rtc->nvram_old_abi = true; - ret = rtc_register_device(pdata->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index cdf5e05b9489..c6a5563504e5 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -294,7 +294,6 @@ static int ds1553_rtc_probe(struct platform_device *pdev) return PTR_ERR(pdata->rtc); pdata->rtc->ops = &ds1553_rtc_ops; - pdata->rtc->nvram_old_abi = true; ret = rtc_register_device(pdata->rtc); if (ret) diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index dfbd7b88b2b9..9043c96e8845 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -1316,7 +1316,6 @@ ds1685_rtc_probe(struct platform_device *pdev) if (ret) return ret; - rtc_dev->nvram_old_abi = true; nvmem_cfg.priv = rtc; ret = rtc_nvmem_register(rtc_dev, &nvmem_cfg); if (ret) diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 
2b949f0dbaa9..291bbed90ef8 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -190,7 +190,6 @@ static int ds1742_rtc_probe(struct platform_device *pdev) return PTR_ERR(rtc); rtc->ops = &ds1742_rtc_ops; - rtc->nvram_old_abi = true; ret = rtc_register_device(rtc); if (ret) diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index 67e218758a8b..ee1d8f0146fd 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -463,7 +463,6 @@ static int m48t59_rtc_probe(struct platform_device *pdev) if (IS_ERR(m48t59->rtc)) return PTR_ERR(m48t59->rtc); - m48t59->rtc->nvram_old_abi = true; m48t59->rtc->ops = ops; nvmem_cfg.size = pdata->offset; diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index 75a0e73071d8..2b1135590dd5 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -254,7 +254,6 @@ static int m48t86_rtc_probe(struct platform_device *pdev) return PTR_ERR(info->rtc); info->rtc->ops = &m48t86_rtc_ops; - info->rtc->nvram_old_abi = true; err = rtc_register_device(info->rtc); if (err) diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c index 8776eadbdd3a..a69e8adcc4a1 100644 --- a/drivers/rtc/rtc-rp5c01.c +++ b/drivers/rtc/rtc-rp5c01.c @@ -251,7 +251,6 @@ static int __init rp5c01_rtc_probe(struct platform_device *dev) return PTR_ERR(rtc); rtc->ops = &rp5c01_rtc_ops; - rtc->nvram_old_abi = true; priv->rtc = rtc; diff --git a/drivers/rtc/rtc-rv8803.c b/drivers/rtc/rtc-rv8803.c index c6d8e3425688..1d888da48c7c 100644 --- a/drivers/rtc/rtc-rv8803.c +++ b/drivers/rtc/rtc-rv8803.c @@ -585,7 +585,6 @@ static int rv8803_probe(struct i2c_client *client, } rv8803->rtc->ops = &rv8803_rtc_ops; - rv8803->rtc->nvram_old_abi = true; rv8803->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rv8803->rtc->range_max = RTC_TIMESTAMP_END_2099; err = rtc_register_device(rv8803->rtc); diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index 01a45044f468..1ccf0d5d05b4 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -311,7 +311,6 @@ static int stk17ta8_rtc_probe(struct platform_device *pdev) return PTR_ERR(pdata->rtc); pdata->rtc->ops = &stk17ta8_rtc_ops; - pdata->rtc->nvram_old_abi = true; nvmem_cfg.priv = pdata; ret = rtc_nvmem_register(pdata->rtc, &nvmem_cfg); diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index 715b82981279..abbb62b14d7a 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -266,7 +266,6 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) return PTR_ERR(rtc); rtc->ops = &tx4939_rtc_ops; - rtc->nvram_old_abi = true; rtc->range_max = U32_MAX; pdata->rtc = rtc; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 22d1575e4991..0983ab9faffb 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -120,10 +120,6 @@ struct rtc_device { bool registered; - /* Old ABI support */ - bool nvram_old_abi; - struct bin_attribute *nvram; - time64_t range_min; timeu64_t range_max; time64_t start_secs; @@ -250,14 +246,12 @@ extern int rtc_hctosys_ret; #ifdef CONFIG_RTC_NVMEM int rtc_nvmem_register(struct rtc_device *rtc, struct nvmem_config *nvmem_config); -void rtc_nvmem_unregister(struct rtc_device *rtc); #else static inline int rtc_nvmem_register(struct rtc_device *rtc, struct nvmem_config *nvmem_config) { return 0; } -static inline void rtc_nvmem_unregister(struct rtc_device *rtc) {} #endif #ifdef CONFIG_RTC_INTF_SYSFS -- cgit v1.2.3 From 3a905c2d9544a418953d6c18668f0f853fbd9be9 Mon Sep 17 00:00:00 2001 From: Bartosz 
Golaszewski Date: Mon, 9 Nov 2020 17:34:06 +0100 Subject: rtc: add devm_ prefix to rtc_nvmem_register() rtc_nvmem_register() is a managed interface. It doesn't require any release function to be called at driver detach. To avoid confusing driver authors, let's rename it to devm_rtc_nvmem_register() and add it to the list of managed interfaces in Documentation/. Signed-off-by: Bartosz Golaszewski Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201109163409.24301-6-brgl@bgdev.pl --- Documentation/driver-api/driver-model/devres.rst | 1 + drivers/rtc/nvmem.c | 4 ++-- drivers/rtc/rtc-cmos.c | 2 +- drivers/rtc/rtc-ds1305.c | 2 +- drivers/rtc/rtc-ds1307.c | 2 +- drivers/rtc/rtc-ds1343.c | 2 +- drivers/rtc/rtc-ds1511.c | 2 +- drivers/rtc/rtc-ds1553.c | 2 +- drivers/rtc/rtc-ds1685.c | 2 +- drivers/rtc/rtc-ds1742.c | 2 +- drivers/rtc/rtc-ds3232.c | 2 +- drivers/rtc/rtc-isl12026.c | 2 +- drivers/rtc/rtc-isl1208.c | 2 +- drivers/rtc/rtc-m48t59.c | 2 +- drivers/rtc/rtc-m48t86.c | 2 +- drivers/rtc/rtc-meson.c | 2 +- drivers/rtc/rtc-omap.c | 2 +- drivers/rtc/rtc-pcf2127.c | 2 +- drivers/rtc/rtc-pcf85063.c | 2 +- drivers/rtc/rtc-pcf85363.c | 2 +- drivers/rtc/rtc-rp5c01.c | 2 +- drivers/rtc/rtc-rv3028.c | 4 ++-- drivers/rtc/rtc-rv3029c2.c | 2 +- drivers/rtc/rtc-rv3032.c | 4 ++-- drivers/rtc/rtc-rv8803.c | 2 +- drivers/rtc/rtc-rx8581.c | 2 +- drivers/rtc/rtc-stk17ta8.c | 2 +- drivers/rtc/rtc-tx4939.c | 2 +- include/linux/rtc.h | 8 ++++---- 29 files changed, 35 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 6ffc0f07404f..5df7ba54a4ba 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -414,6 +414,7 @@ RESET RTC devm_rtc_device_register() devm_rtc_allocate_device() + devm_rtc_nvmem_register() SERDEV devm_serdev_device_open() diff --git a/drivers/rtc/nvmem.c b/drivers/rtc/nvmem.c index 5e0b178a3b65..7502deb6390e 100644 --- a/drivers/rtc/nvmem.c +++ b/drivers/rtc/nvmem.c @@ -10,7 +10,7 @@ #include #include -int rtc_nvmem_register(struct rtc_device *rtc, +int devm_rtc_nvmem_register(struct rtc_device *rtc, struct nvmem_config *nvmem_config) { struct nvmem_device *nvmem; @@ -24,4 +24,4 @@ int rtc_nvmem_register(struct rtc_device *rtc, return PTR_ERR_OR_ZERO(nvmem); } -EXPORT_SYMBOL_GPL(rtc_nvmem_register); +EXPORT_SYMBOL_GPL(devm_rtc_nvmem_register); diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index adca0de76e53..eea91c1538aa 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -869,7 +869,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) /* export at least the first block of NVRAM */ nvmem_cfg.size = address_space - NVRAM_OFFSET; - if (rtc_nvmem_register(cmos_rtc.rtc, &nvmem_cfg)) + if (devm_rtc_nvmem_register(cmos_rtc.rtc, &nvmem_cfg)) dev_err(dev, "nvmem registration failed\n"); dev_info(dev, "%s%s, %d bytes nvram%s\n", diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index a1ed539d41b4..a4e768261b43 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -698,7 +698,7 @@ static int ds1305_probe(struct spi_device *spi) if (status) return status; - rtc_nvmem_register(ds1305->rtc, &ds1305_nvmem_cfg); + devm_rtc_nvmem_register(ds1305->rtc, &ds1305_nvmem_cfg); /* Maybe set up alarm IRQ; be ready to handle it triggering right * away. NOTE that we don't share this. 
The signal is active low, diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index e359cbf7882b..216bc5d9b716 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -2016,7 +2016,7 @@ static int ds1307_probe(struct i2c_client *client, .priv = ds1307, }; - rtc_nvmem_register(ds1307->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(ds1307->rtc, &nvmem_cfg); } ds1307_hwmon_register(ds1307); diff --git a/drivers/rtc/rtc-ds1343.c b/drivers/rtc/rtc-ds1343.c index e7604e844cbd..ea663e24a34c 100644 --- a/drivers/rtc/rtc-ds1343.c +++ b/drivers/rtc/rtc-ds1343.c @@ -413,7 +413,7 @@ static int ds1343_probe(struct spi_device *spi) return res; nvmem_cfg.priv = priv; - rtc_nvmem_register(priv->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(priv->rtc, &nvmem_cfg); priv->irq = spi->irq; diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 33c483d759c8..d5f48216e851 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -470,7 +470,7 @@ static int ds1511_rtc_probe(struct platform_device *pdev) if (ret) return ret; - rtc_nvmem_register(pdata->rtc, &ds1511_nvmem_cfg); + devm_rtc_nvmem_register(pdata->rtc, &ds1511_nvmem_cfg); /* * if the platform has an interrupt in mind for this device, diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index c6a5563504e5..2d2eb739d92b 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -309,7 +309,7 @@ static int ds1553_rtc_probe(struct platform_device *pdev) } } - if (rtc_nvmem_register(pdata->rtc, &nvmem_cfg)) + if (devm_rtc_nvmem_register(pdata->rtc, &nvmem_cfg)) dev_err(&pdev->dev, "unable to register nvmem\n"); return 0; diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index 9043c96e8845..bef588fce266 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -1317,7 +1317,7 @@ ds1685_rtc_probe(struct platform_device *pdev) return ret; nvmem_cfg.priv = rtc; - ret = rtc_nvmem_register(rtc_dev, &nvmem_cfg); + ret = devm_rtc_nvmem_register(rtc_dev, &nvmem_cfg); if (ret) return ret; diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 291bbed90ef8..29792a8cce97 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -195,7 +195,7 @@ static int ds1742_rtc_probe(struct platform_device *pdev) if (ret) return ret; - if (rtc_nvmem_register(rtc, &nvmem_cfg)) + if (devm_rtc_nvmem_register(rtc, &nvmem_cfg)) dev_err(&pdev->dev, "Unable to register nvmem\n"); return 0; diff --git a/drivers/rtc/rtc-ds3232.c b/drivers/rtc/rtc-ds3232.c index 69c37ab64352..16b89035d135 100644 --- a/drivers/rtc/rtc-ds3232.c +++ b/drivers/rtc/rtc-ds3232.c @@ -518,7 +518,7 @@ static int ds3232_probe(struct device *dev, struct regmap *regmap, int irq, if (IS_ERR(ds3232->rtc)) return PTR_ERR(ds3232->rtc); - ret = rtc_nvmem_register(ds3232->rtc, &nvmem_cfg); + ret = devm_rtc_nvmem_register(ds3232->rtc, &nvmem_cfg); if(ret) return ret; diff --git a/drivers/rtc/rtc-isl12026.c b/drivers/rtc/rtc-isl12026.c index 5b6b17fb6d62..fff8d8253669 100644 --- a/drivers/rtc/rtc-isl12026.c +++ b/drivers/rtc/rtc-isl12026.c @@ -465,7 +465,7 @@ static int isl12026_probe_new(struct i2c_client *client) priv->rtc->ops = &isl12026_rtc_ops; nvm_cfg.priv = priv; - ret = rtc_nvmem_register(priv->rtc, &nvm_cfg); + ret = devm_rtc_nvmem_register(priv->rtc, &nvm_cfg); if (ret) return ret; diff --git a/drivers/rtc/rtc-isl1208.c b/drivers/rtc/rtc-isl1208.c index ebb691fa48a6..08d778b10e9e 100644 --- a/drivers/rtc/rtc-isl1208.c +++ b/drivers/rtc/rtc-isl1208.c @@ -890,7 +890,7 @@ 
isl1208_probe(struct i2c_client *client, const struct i2c_device_id *id) if (rc) return rc; - rc = rtc_nvmem_register(isl1208->rtc, &isl1208->nvmem_config); + rc = devm_rtc_nvmem_register(isl1208->rtc, &isl1208->nvmem_config); if (rc) return rc; diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index ee1d8f0146fd..e966a66ab2d3 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -466,7 +466,7 @@ static int m48t59_rtc_probe(struct platform_device *pdev) m48t59->rtc->ops = ops; nvmem_cfg.size = pdata->offset; - ret = rtc_nvmem_register(m48t59->rtc, &nvmem_cfg); + ret = devm_rtc_nvmem_register(m48t59->rtc, &nvmem_cfg); if (ret) return ret; diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index 2b1135590dd5..182cfe59e4e0 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -259,7 +259,7 @@ static int m48t86_rtc_probe(struct platform_device *pdev) if (err) return err; - rtc_nvmem_register(info->rtc, &m48t86_nvmem_cfg); + devm_rtc_nvmem_register(info->rtc, &m48t86_nvmem_cfg); /* read battery status */ reg = m48t86_readb(&pdev->dev, M48T86_D); diff --git a/drivers/rtc/rtc-meson.c b/drivers/rtc/rtc-meson.c index 47ebcf834cc2..938267713a4d 100644 --- a/drivers/rtc/rtc-meson.c +++ b/drivers/rtc/rtc-meson.c @@ -365,7 +365,7 @@ static int meson_rtc_probe(struct platform_device *pdev) } meson_rtc_nvmem_config.priv = rtc; - ret = rtc_nvmem_register(rtc->rtc, &meson_rtc_nvmem_config); + ret = devm_rtc_nvmem_register(rtc->rtc, &meson_rtc_nvmem_config); if (ret) goto out_disable_vdd; diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index 606fa80ad6e0..e65f79fc7718 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -890,7 +890,7 @@ static int omap_rtc_probe(struct platform_device *pdev) if (ret) goto err; - rtc_nvmem_register(rtc->rtc, &omap_rtc_nvmem_config); + devm_rtc_nvmem_register(rtc->rtc, &omap_rtc_nvmem_config); if (rtc->is_pmic_controller) { if (!pm_power_off) { diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c index fd46860152e1..432cd627359b 100644 --- a/drivers/rtc/rtc-pcf2127.c +++ b/drivers/rtc/rtc-pcf2127.c @@ -608,7 +608,7 @@ static int pcf2127_probe(struct device *dev, struct regmap *regmap, .size = 512, }; - ret = rtc_nvmem_register(pcf2127->rtc, &nvmem_cfg); + ret = devm_rtc_nvmem_register(pcf2127->rtc, &nvmem_cfg); } /* diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c index f8b99cb72959..c19f139e9b8d 100644 --- a/drivers/rtc/rtc-pcf85063.c +++ b/drivers/rtc/rtc-pcf85063.c @@ -607,7 +607,7 @@ static int pcf85063_probe(struct i2c_client *client) } nvmem_cfg.priv = pcf85063->regmap; - rtc_nvmem_register(pcf85063->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(pcf85063->rtc, &nvmem_cfg); #ifdef CONFIG_COMMON_CLK /* register clk in common clk framework */ diff --git a/drivers/rtc/rtc-pcf85363.c b/drivers/rtc/rtc-pcf85363.c index 3450d615974d..23cf14ca2c96 100644 --- a/drivers/rtc/rtc-pcf85363.c +++ b/drivers/rtc/rtc-pcf85363.c @@ -422,7 +422,7 @@ static int pcf85363_probe(struct i2c_client *client, for (i = 0; i < config->num_nvram; i++) { nvmem_cfg[i].priv = pcf85363; - rtc_nvmem_register(pcf85363->rtc, &nvmem_cfg[i]); + devm_rtc_nvmem_register(pcf85363->rtc, &nvmem_cfg[i]); } return ret; diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c index a69e8adcc4a1..8bc476c0905f 100644 --- a/drivers/rtc/rtc-rp5c01.c +++ b/drivers/rtc/rtc-rp5c01.c @@ -255,7 +255,7 @@ static int __init rp5c01_rtc_probe(struct platform_device *dev) priv->rtc = rtc; 
nvmem_cfg.priv = priv; - error = rtc_nvmem_register(rtc, &nvmem_cfg); + error = devm_rtc_nvmem_register(rtc, &nvmem_cfg); if (error) return error; diff --git a/drivers/rtc/rtc-rv3028.c b/drivers/rtc/rtc-rv3028.c index fa226f0fe67d..f788df979750 100644 --- a/drivers/rtc/rtc-rv3028.c +++ b/drivers/rtc/rtc-rv3028.c @@ -891,9 +891,9 @@ static int rv3028_probe(struct i2c_client *client) return ret; nvmem_cfg.priv = rv3028->regmap; - rtc_nvmem_register(rv3028->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(rv3028->rtc, &nvmem_cfg); eeprom_cfg.priv = rv3028; - rtc_nvmem_register(rv3028->rtc, &eeprom_cfg); + devm_rtc_nvmem_register(rv3028->rtc, &eeprom_cfg); rv3028->rtc->max_user_freq = 1; diff --git a/drivers/rtc/rtc-rv3029c2.c b/drivers/rtc/rtc-rv3029c2.c index 62718231731b..ad359b3b74b2 100644 --- a/drivers/rtc/rtc-rv3029c2.c +++ b/drivers/rtc/rtc-rv3029c2.c @@ -755,7 +755,7 @@ static int rv3029_probe(struct device *dev, struct regmap *regmap, int irq, return rc; nvmem_cfg.priv = rv3029->regmap; - rtc_nvmem_register(rv3029->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(rv3029->rtc, &nvmem_cfg); return 0; } diff --git a/drivers/rtc/rtc-rv3032.c b/drivers/rtc/rtc-rv3032.c index 14e931d6f9c6..ed9cba3292e6 100644 --- a/drivers/rtc/rtc-rv3032.c +++ b/drivers/rtc/rtc-rv3032.c @@ -890,9 +890,9 @@ static int rv3032_probe(struct i2c_client *client) return ret; nvmem_cfg.priv = rv3032->regmap; - rtc_nvmem_register(rv3032->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(rv3032->rtc, &nvmem_cfg); eeprom_cfg.priv = rv3032; - rtc_nvmem_register(rv3032->rtc, &eeprom_cfg); + devm_rtc_nvmem_register(rv3032->rtc, &eeprom_cfg); rv3032->rtc->max_user_freq = 1; diff --git a/drivers/rtc/rtc-rv8803.c b/drivers/rtc/rtc-rv8803.c index 1d888da48c7c..44e1818a751c 100644 --- a/drivers/rtc/rtc-rv8803.c +++ b/drivers/rtc/rtc-rv8803.c @@ -591,7 +591,7 @@ static int rv8803_probe(struct i2c_client *client, if (err) return err; - rtc_nvmem_register(rv8803->rtc, &nvmem_cfg); + devm_rtc_nvmem_register(rv8803->rtc, &nvmem_cfg); rv8803->rtc->max_user_freq = 1; diff --git a/drivers/rtc/rtc-rx8581.c b/drivers/rtc/rtc-rx8581.c index 490f70f57636..017f74721cc0 100644 --- a/drivers/rtc/rtc-rx8581.c +++ b/drivers/rtc/rtc-rx8581.c @@ -302,7 +302,7 @@ static int rx8581_probe(struct i2c_client *client, for (i = 0; i < config->num_nvram; i++) { nvmem_cfg[i].priv = rx8581; - rtc_nvmem_register(rx8581->rtc, &nvmem_cfg[i]); + devm_rtc_nvmem_register(rx8581->rtc, &nvmem_cfg[i]); } return ret; diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index 1ccf0d5d05b4..ad616bce7bca 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -313,7 +313,7 @@ static int stk17ta8_rtc_probe(struct platform_device *pdev) pdata->rtc->ops = &stk17ta8_rtc_ops; nvmem_cfg.priv = pdata; - ret = rtc_nvmem_register(pdata->rtc, &nvmem_cfg); + ret = devm_rtc_nvmem_register(pdata->rtc, &nvmem_cfg); if (ret) return ret; diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index abbb62b14d7a..11f46272bad3 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -271,7 +271,7 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) pdata->rtc = rtc; nvmem_cfg.priv = pdata; - ret = rtc_nvmem_register(rtc, &nvmem_cfg); + ret = devm_rtc_nvmem_register(rtc, &nvmem_cfg); if (ret) return ret; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 0983ab9faffb..cbca651d8ca4 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -244,11 +244,11 @@ extern int rtc_hctosys_ret; #endif #ifdef 
CONFIG_RTC_NVMEM -int rtc_nvmem_register(struct rtc_device *rtc, - struct nvmem_config *nvmem_config); +int devm_rtc_nvmem_register(struct rtc_device *rtc, + struct nvmem_config *nvmem_config); #else -static inline int rtc_nvmem_register(struct rtc_device *rtc, - struct nvmem_config *nvmem_config) +static inline int devm_rtc_nvmem_register(struct rtc_device *rtc, + struct nvmem_config *nvmem_config) { return 0; } -- cgit v1.2.3 From fdcfd854333be5b30377dc5daa9cd0fa1643a979 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 9 Nov 2020 17:34:08 +0100 Subject: rtc: rework rtc_register_device() resource management rtc_register_device() is a managed interface but it doesn't use devres by itself - instead it marks an rtc_device as "registered" and the devres callback for devm_rtc_allocate_device() takes care of resource release. This doesn't correspond with the design behind devres where managed structures should not be aware of being managed. The correct solution here is to register a separate devres callback for unregistering the device. While at it: rename rtc_register_device() to devm_rtc_register_device() and add it to the list of managed interfaces in devres.rst. This way we can avoid any potential confusion of driver developers who may expect there to exist a corresponding unregister function. Signed-off-by: Bartosz Golaszewski Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201109163409.24301-8-brgl@bgdev.pl --- Documentation/driver-api/driver-model/devres.rst | 1 + arch/alpha/kernel/rtc.c | 2 +- drivers/mfd/menelaus.c | 2 +- drivers/rtc/class.c | 19 +++++++++---------- drivers/rtc/rtc-88pm80x.c | 2 +- drivers/rtc/rtc-88pm860x.c | 2 +- drivers/rtc/rtc-ab-b5ze-s3.c | 2 +- drivers/rtc/rtc-ab-eoz9.c | 2 +- drivers/rtc/rtc-ab3100.c | 2 +- drivers/rtc/rtc-ab8500.c | 2 +- drivers/rtc/rtc-abx80x.c | 2 +- drivers/rtc/rtc-ac100.c | 2 +- drivers/rtc/rtc-armada38x.c | 2 +- drivers/rtc/rtc-aspeed.c | 2 +- drivers/rtc/rtc-at91rm9200.c | 2 +- drivers/rtc/rtc-at91sam9.c | 2 +- drivers/rtc/rtc-au1xxx.c | 2 +- drivers/rtc/rtc-bd70528.c | 2 +- drivers/rtc/rtc-brcmstb-waketimer.c | 2 +- drivers/rtc/rtc-cadence.c | 2 +- drivers/rtc/rtc-cmos.c | 2 +- drivers/rtc/rtc-coh901331.c | 2 +- drivers/rtc/rtc-cpcap.c | 2 +- drivers/rtc/rtc-cros-ec.c | 2 +- drivers/rtc/rtc-da9052.c | 2 +- drivers/rtc/rtc-da9063.c | 2 +- drivers/rtc/rtc-davinci.c | 2 +- drivers/rtc/rtc-digicolor.c | 2 +- drivers/rtc/rtc-dm355evm.c | 2 +- drivers/rtc/rtc-ds1305.c | 2 +- drivers/rtc/rtc-ds1307.c | 2 +- drivers/rtc/rtc-ds1343.c | 2 +- drivers/rtc/rtc-ds1347.c | 2 +- drivers/rtc/rtc-ds1374.c | 2 +- drivers/rtc/rtc-ds1511.c | 2 +- drivers/rtc/rtc-ds1553.c | 2 +- drivers/rtc/rtc-ds1672.c | 2 +- drivers/rtc/rtc-ds1685.c | 2 +- drivers/rtc/rtc-ds1742.c | 2 +- drivers/rtc/rtc-ds2404.c | 2 +- drivers/rtc/rtc-ep93xx.c | 2 +- drivers/rtc/rtc-fsl-ftm-alarm.c | 2 +- drivers/rtc/rtc-ftrtc010.c | 2 +- drivers/rtc/rtc-goldfish.c | 2 +- drivers/rtc/rtc-imx-sc.c | 2 +- drivers/rtc/rtc-imxdi.c | 2 +- drivers/rtc/rtc-isl12026.c | 2 +- drivers/rtc/rtc-isl1208.c | 2 +- drivers/rtc/rtc-jz4740.c | 2 +- drivers/rtc/rtc-lpc32xx.c | 2 +- drivers/rtc/rtc-ls1x.c | 2 +- drivers/rtc/rtc-m41t80.c | 2 +- drivers/rtc/rtc-m48t59.c | 2 +- drivers/rtc/rtc-m48t86.c | 2 +- drivers/rtc/rtc-mc13xxx.c | 2 +- drivers/rtc/rtc-meson-vrtc.c | 2 +- drivers/rtc/rtc-meson.c | 2 +- drivers/rtc/rtc-mpc5121.c | 2 +- drivers/rtc/rtc-mrst.c | 2 +- drivers/rtc/rtc-mt2712.c | 2 +- drivers/rtc/rtc-mt6397.c | 2 +- drivers/rtc/rtc-mv.c | 2 +- drivers/rtc/rtc-mxc.c 
| 2 +- drivers/rtc/rtc-mxc_v2.c | 2 +- drivers/rtc/rtc-omap.c | 2 +- drivers/rtc/rtc-pcap.c | 2 +- drivers/rtc/rtc-pcf2123.c | 2 +- drivers/rtc/rtc-pcf2127.c | 2 +- drivers/rtc/rtc-pcf85063.c | 2 +- drivers/rtc/rtc-pcf85363.c | 2 +- drivers/rtc/rtc-pcf8563.c | 2 +- drivers/rtc/rtc-pic32.c | 2 +- drivers/rtc/rtc-pl030.c | 2 +- drivers/rtc/rtc-pl031.c | 2 +- drivers/rtc/rtc-pm8xxx.c | 2 +- drivers/rtc/rtc-ps3.c | 2 +- drivers/rtc/rtc-r9701.c | 2 +- drivers/rtc/rtc-rc5t619.c | 2 +- drivers/rtc/rtc-rk808.c | 2 +- drivers/rtc/rtc-rp5c01.c | 2 +- drivers/rtc/rtc-rs5c348.c | 2 +- drivers/rtc/rtc-rv3028.c | 2 +- drivers/rtc/rtc-rv3029c2.c | 2 +- drivers/rtc/rtc-rv3032.c | 2 +- drivers/rtc/rtc-rv8803.c | 2 +- drivers/rtc/rtc-rx8010.c | 2 +- drivers/rtc/rtc-rx8581.c | 2 +- drivers/rtc/rtc-s35390a.c | 2 +- drivers/rtc/rtc-sa1100.c | 2 +- drivers/rtc/rtc-sc27xx.c | 2 +- drivers/rtc/rtc-sd3078.c | 2 +- drivers/rtc/rtc-sh.c | 2 +- drivers/rtc/rtc-sirfsoc.c | 2 +- drivers/rtc/rtc-snvs.c | 2 +- drivers/rtc/rtc-st-lpc.c | 2 +- drivers/rtc/rtc-starfire.c | 2 +- drivers/rtc/rtc-stk17ta8.c | 2 +- drivers/rtc/rtc-stmp3xxx.c | 2 +- drivers/rtc/rtc-sun4v.c | 2 +- drivers/rtc/rtc-sun6i.c | 2 +- drivers/rtc/rtc-sunxi.c | 2 +- drivers/rtc/rtc-tegra.c | 2 +- drivers/rtc/rtc-test.c | 2 +- drivers/rtc/rtc-tps6586x.c | 2 +- drivers/rtc/rtc-tps65910.c | 2 +- drivers/rtc/rtc-tx4939.c | 2 +- drivers/rtc/rtc-vr41xx.c | 2 +- drivers/rtc/rtc-vt8500.c | 2 +- drivers/rtc/rtc-wilco-ec.c | 2 +- drivers/rtc/rtc-wm831x.c | 2 +- drivers/rtc/rtc-xgene.c | 2 +- drivers/rtc/rtc-zynqmp.c | 2 +- drivers/rtc/sysfs.c | 2 -- include/linux/rtc.h | 8 +++----- 114 files changed, 123 insertions(+), 127 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 5df7ba54a4ba..cd8b6e657b94 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -414,6 +414,7 @@ RESET RTC devm_rtc_device_register() devm_rtc_allocate_device() + devm_rtc_register_device() devm_rtc_nvmem_register() SERDEV diff --git a/arch/alpha/kernel/rtc.c b/arch/alpha/kernel/rtc.c index 1b1d5963ac55..ce3077946e1d 100644 --- a/arch/alpha/kernel/rtc.c +++ b/arch/alpha/kernel/rtc.c @@ -216,6 +216,6 @@ alpha_rtc_init(void) rtc->ops = &remote_rtc_ops; #endif - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } device_initcall(alpha_rtc_init); diff --git a/drivers/mfd/menelaus.c b/drivers/mfd/menelaus.c index b64d3315a5e1..07e0ca2e467c 100644 --- a/drivers/mfd/menelaus.c +++ b/drivers/mfd/menelaus.c @@ -1119,7 +1119,7 @@ static inline void menelaus_rtc_init(struct menelaus_chip *m) menelaus_write_reg(MENELAUS_RTC_CTRL, m->rtc_control); } - err = rtc_register_device(m->rtc); + err = devm_rtc_register_device(m->rtc); if (err) { if (alarm) { menelaus_remove_irq_work(MENELAUS_RTCALM_IRQ); diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index a99b7d24b77c..b8a34ee039ad 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -321,8 +321,10 @@ static void rtc_device_get_offset(struct rtc_device *rtc) * * @rtc: the RTC class device to destroy */ -static void rtc_device_unregister(struct rtc_device *rtc) +static void devm_rtc_unregister_device(void *data) { + struct rtc_device *rtc = data; + mutex_lock(&rtc->ops_lock); /* * Remove innards of this RTC, then disable it, before @@ -339,10 +341,7 @@ static void devm_rtc_release_device(struct device *dev, void *res) { struct rtc_device *rtc = *(struct 
rtc_device **)res; - if (rtc->registered) - rtc_device_unregister(rtc); - else - put_device(&rtc->dev); + put_device(&rtc->dev); } struct rtc_device *devm_rtc_allocate_device(struct device *dev) @@ -383,7 +382,7 @@ exit_ida: } EXPORT_SYMBOL_GPL(devm_rtc_allocate_device); -int __rtc_register_device(struct module *owner, struct rtc_device *rtc) +int __devm_rtc_register_device(struct module *owner, struct rtc_device *rtc) { struct rtc_wkalrm alrm; int err; @@ -413,7 +412,6 @@ int __rtc_register_device(struct module *owner, struct rtc_device *rtc) rtc_proc_add_device(rtc); - rtc->registered = true; dev_info(rtc->dev.parent, "registered as %s\n", dev_name(&rtc->dev)); @@ -422,9 +420,10 @@ int __rtc_register_device(struct module *owner, struct rtc_device *rtc) rtc_hctosys(rtc); #endif - return 0; + return devm_add_action_or_reset(rtc->dev.parent, + devm_rtc_unregister_device, rtc); } -EXPORT_SYMBOL_GPL(__rtc_register_device); +EXPORT_SYMBOL_GPL(__devm_rtc_register_device); /** * devm_rtc_device_register - resource managed rtc_device_register() @@ -454,7 +453,7 @@ struct rtc_device *devm_rtc_device_register(struct device *dev, rtc->ops = ops; - err = __rtc_register_device(owner, rtc); + err = __devm_rtc_register_device(owner, rtc); if (err) return ERR_PTR(err); diff --git a/drivers/rtc/rtc-88pm80x.c b/drivers/rtc/rtc-88pm80x.c index 75779e8501a3..6a3f44cf6ebe 100644 --- a/drivers/rtc/rtc-88pm80x.c +++ b/drivers/rtc/rtc-88pm80x.c @@ -294,7 +294,7 @@ static int pm80x_rtc_probe(struct platform_device *pdev) info->rtc_dev->ops = &pm80x_rtc_ops; info->rtc_dev->range_max = U32_MAX; - ret = rtc_register_device(info->rtc_dev); + ret = devm_rtc_register_device(info->rtc_dev); if (ret) goto out_rtc; diff --git a/drivers/rtc/rtc-88pm860x.c b/drivers/rtc/rtc-88pm860x.c index c90457d001e9..2c809a1a445e 100644 --- a/drivers/rtc/rtc-88pm860x.c +++ b/drivers/rtc/rtc-88pm860x.c @@ -307,7 +307,7 @@ static int pm860x_rtc_probe(struct platform_device *pdev) info->rtc_dev->ops = &pm860x_rtc_ops; info->rtc_dev->range_max = U32_MAX; - ret = rtc_register_device(info->rtc_dev); + ret = devm_rtc_register_device(info->rtc_dev); if (ret) return ret; diff --git a/drivers/rtc/rtc-ab-b5ze-s3.c b/drivers/rtc/rtc-ab-b5ze-s3.c index 2370ac0cdb5f..6e3e320dc727 100644 --- a/drivers/rtc/rtc-ab-b5ze-s3.c +++ b/drivers/rtc/rtc-ab-b5ze-s3.c @@ -892,7 +892,7 @@ static int abb5zes3_probe(struct i2c_client *client, } } - ret = rtc_register_device(data->rtc); + ret = devm_rtc_register_device(data->rtc); err: if (ret && data->irq) diff --git a/drivers/rtc/rtc-ab-eoz9.c b/drivers/rtc/rtc-ab-eoz9.c index d690985caa4c..b20d8f26dcdb 100644 --- a/drivers/rtc/rtc-ab-eoz9.c +++ b/drivers/rtc/rtc-ab-eoz9.c @@ -420,7 +420,7 @@ static int abeoz9_probe(struct i2c_client *client, data->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; data->rtc->range_max = RTC_TIMESTAMP_END_2099; - ret = rtc_register_device(data->rtc); + ret = devm_rtc_register_device(data->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-ab3100.c b/drivers/rtc/rtc-ab3100.c index 2ed6def90975..e4fd961e8bf6 100644 --- a/drivers/rtc/rtc-ab3100.c +++ b/drivers/rtc/rtc-ab3100.c @@ -238,7 +238,7 @@ static int __init ab3100_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc); - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver ab3100_rtc_driver = { diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index 3d60f3283f11..b40048871295 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ 
-404,7 +404,7 @@ static int ab8500_rtc_probe(struct platform_device *pdev) if (err) return err; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static int ab8500_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-abx80x.c b/drivers/rtc/rtc-abx80x.c index 803725b3a02c..6733bb0df674 100644 --- a/drivers/rtc/rtc-abx80x.c +++ b/drivers/rtc/rtc-abx80x.c @@ -851,7 +851,7 @@ static int abx80x_probe(struct i2c_client *client, return err; } - return rtc_register_device(priv->rtc); + return devm_rtc_register_device(priv->rtc); } static const struct i2c_device_id abx80x_id[] = { diff --git a/drivers/rtc/rtc-ac100.c b/drivers/rtc/rtc-ac100.c index 29223931aba7..1ddbef99e38f 100644 --- a/drivers/rtc/rtc-ac100.c +++ b/drivers/rtc/rtc-ac100.c @@ -610,7 +610,7 @@ static int ac100_rtc_probe(struct platform_device *pdev) if (ret) return ret; - return rtc_register_device(chip->rtc); + return devm_rtc_register_device(chip->rtc); } static int ac100_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-armada38x.c b/drivers/rtc/rtc-armada38x.c index 94d7c22fc4f3..807a79c07f08 100644 --- a/drivers/rtc/rtc-armada38x.c +++ b/drivers/rtc/rtc-armada38x.c @@ -556,7 +556,7 @@ static __init int armada38x_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->range_max = U32_MAX; - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-aspeed.c b/drivers/rtc/rtc-aspeed.c index eacdd0637cce..a93352ed3aec 100644 --- a/drivers/rtc/rtc-aspeed.c +++ b/drivers/rtc/rtc-aspeed.c @@ -104,7 +104,7 @@ static int aspeed_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->range_min = RTC_TIMESTAMP_BEGIN_1900; rtc->rtc_dev->range_max = 38814989399LL; /* 3199-12-31 23:59:59 */ - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } static const struct of_device_id aspeed_rtc_match[] = { diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index da24e68adcca..fe396d27ebb7 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -538,7 +538,7 @@ static int __init at91_rtc_probe(struct platform_device *pdev) rtc->range_min = RTC_TIMESTAMP_BEGIN_1900; rtc->range_max = RTC_TIMESTAMP_END_2099; - ret = rtc_register_device(rtc); + ret = devm_rtc_register_device(rtc); if (ret) goto err_clk; diff --git a/drivers/rtc/rtc-at91sam9.c b/drivers/rtc/rtc-at91sam9.c index e39e89867d29..2216be429ab7 100644 --- a/drivers/rtc/rtc-at91sam9.c +++ b/drivers/rtc/rtc-at91sam9.c @@ -431,7 +431,7 @@ static int at91_rtc_probe(struct platform_device *pdev) dev_warn(&pdev->dev, "%s: SET TIME!\n", dev_name(&rtc->rtcdev->dev)); - return rtc_register_device(rtc->rtcdev); + return devm_rtc_register_device(rtc->rtcdev); err_clk: clk_disable_unprepare(rtc->sclk); diff --git a/drivers/rtc/rtc-au1xxx.c b/drivers/rtc/rtc-au1xxx.c index 791bebcb6f47..e6428b27b5d4 100644 --- a/drivers/rtc/rtc-au1xxx.c +++ b/drivers/rtc/rtc-au1xxx.c @@ -104,7 +104,7 @@ static int au1xtoy_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtcdev); - return rtc_register_device(rtcdev); + return devm_rtc_register_device(rtcdev); } static struct platform_driver au1xrtc_driver = { diff --git a/drivers/rtc/rtc-bd70528.c b/drivers/rtc/rtc-bd70528.c index 4492b770422c..17cb67f5bf6e 100644 --- a/drivers/rtc/rtc-bd70528.c +++ b/drivers/rtc/rtc-bd70528.c @@ -604,7 +604,7 @@ static int bd70528_probe(struct platform_device *pdev) } } - return 
rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static const struct platform_device_id bd718x7_rtc_id[] = { diff --git a/drivers/rtc/rtc-brcmstb-waketimer.c b/drivers/rtc/rtc-brcmstb-waketimer.c index 375a9987a1d6..0366e2ff04ae 100644 --- a/drivers/rtc/rtc-brcmstb-waketimer.c +++ b/drivers/rtc/rtc-brcmstb-waketimer.c @@ -252,7 +252,7 @@ static int brcmstb_waketmr_probe(struct platform_device *pdev) timer->rtc->ops = &brcmstb_waketmr_ops; timer->rtc->range_max = U32_MAX; - ret = rtc_register_device(timer->rtc); + ret = devm_rtc_register_device(timer->rtc); if (ret) goto err_notifier; diff --git a/drivers/rtc/rtc-cadence.c b/drivers/rtc/rtc-cadence.c index 595d5d252850..1edf7f16d73a 100644 --- a/drivers/rtc/rtc-cadence.c +++ b/drivers/rtc/rtc-cadence.c @@ -336,7 +336,7 @@ static int cdns_rtc_probe(struct platform_device *pdev) writel(0, crtc->regs + CDNS_RTC_HMR); writel(CDNS_RTC_KRTCR_KRTC, crtc->regs + CDNS_RTC_KRTCR); - ret = rtc_register_device(crtc->rtc_dev); + ret = devm_rtc_register_device(crtc->rtc_dev); if (ret) goto err_disable_wakeup; diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 766074c04b53..83415600185c 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -863,7 +863,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) cmos_rtc.rtc->ops = &cmos_rtc_ops_no_alarm; } - retval = rtc_register_device(cmos_rtc.rtc); + retval = devm_rtc_register_device(cmos_rtc.rtc); if (retval) goto cleanup2; diff --git a/drivers/rtc/rtc-coh901331.c b/drivers/rtc/rtc-coh901331.c index da59917c9ee8..168ced87d93a 100644 --- a/drivers/rtc/rtc-coh901331.c +++ b/drivers/rtc/rtc-coh901331.c @@ -203,7 +203,7 @@ static int __init coh901331_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtap); - ret = rtc_register_device(rtap->rtc); + ret = devm_rtc_register_device(rtap->rtc); if (ret) goto out_no_rtc; diff --git a/drivers/rtc/rtc-cpcap.c b/drivers/rtc/rtc-cpcap.c index 38d576b0c4fa..afc8fcba8f88 100644 --- a/drivers/rtc/rtc-cpcap.c +++ b/drivers/rtc/rtc-cpcap.c @@ -301,7 +301,7 @@ static int cpcap_rtc_probe(struct platform_device *pdev) /* ignore error and continue without wakeup support */ } - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } static const struct of_device_id cpcap_rtc_of_match[] = { diff --git a/drivers/rtc/rtc-cros-ec.c b/drivers/rtc/rtc-cros-ec.c index f7343c289cab..70626793ca69 100644 --- a/drivers/rtc/rtc-cros-ec.c +++ b/drivers/rtc/rtc-cros-ec.c @@ -350,7 +350,7 @@ static int cros_ec_rtc_probe(struct platform_device *pdev) cros_ec_rtc->rtc->ops = &cros_ec_rtc_ops; cros_ec_rtc->rtc->range_max = U32_MAX; - ret = rtc_register_device(cros_ec_rtc->rtc); + ret = devm_rtc_register_device(cros_ec_rtc->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-da9052.c b/drivers/rtc/rtc-da9052.c index 58de10da37b1..9ca99bd35702 100644 --- a/drivers/rtc/rtc-da9052.c +++ b/drivers/rtc/rtc-da9052.c @@ -304,7 +304,7 @@ static int da9052_rtc_probe(struct platform_device *pdev) rtc->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rtc->rtc->range_max = RTC_TIMESTAMP_END_2063; - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-da9063.c b/drivers/rtc/rtc-da9063.c index 6f0a3a711135..d4b72a9fa2ba 100644 --- a/drivers/rtc/rtc-da9063.c +++ b/drivers/rtc/rtc-da9063.c @@ -494,7 +494,7 @@ static int da9063_rtc_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Failed to request ALARM IRQ %d: 
%d\n", irq_alarm, ret); - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } static struct platform_driver da9063_rtc_driver = { diff --git a/drivers/rtc/rtc-davinci.c b/drivers/rtc/rtc-davinci.c index 73f87a17cdf3..6bef0f2353da 100644 --- a/drivers/rtc/rtc-davinci.c +++ b/drivers/rtc/rtc-davinci.c @@ -484,7 +484,7 @@ static int __init davinci_rtc_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 0); - return rtc_register_device(davinci_rtc->rtc); + return devm_rtc_register_device(davinci_rtc->rtc); } static int __exit davinci_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-digicolor.c b/drivers/rtc/rtc-digicolor.c index 200d85b01e8b..4fdfa5b6feb2 100644 --- a/drivers/rtc/rtc-digicolor.c +++ b/drivers/rtc/rtc-digicolor.c @@ -202,7 +202,7 @@ static int __init dc_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->ops = &dc_rtc_ops; rtc->rtc_dev->range_max = U32_MAX; - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } static const struct of_device_id dc_dt_ids[] = { diff --git a/drivers/rtc/rtc-dm355evm.c b/drivers/rtc/rtc-dm355evm.c index cd947a20843b..94fb16ac3e0f 100644 --- a/drivers/rtc/rtc-dm355evm.c +++ b/drivers/rtc/rtc-dm355evm.c @@ -132,7 +132,7 @@ static int dm355evm_rtc_probe(struct platform_device *pdev) rtc->ops = &dm355evm_rtc_ops; rtc->range_max = U32_MAX; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } /* diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c index a4e768261b43..8c2ab29c3d91 100644 --- a/drivers/rtc/rtc-ds1305.c +++ b/drivers/rtc/rtc-ds1305.c @@ -694,7 +694,7 @@ static int ds1305_probe(struct spi_device *spi) ds1305->rtc->range_max = RTC_TIMESTAMP_END_2099; ds1305_nvmem_cfg.priv = ds1305; - status = rtc_register_device(ds1305->rtc); + status = devm_rtc_register_device(ds1305->rtc); if (status) return status; diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 216bc5d9b716..183cf7c01364 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -2001,7 +2001,7 @@ static int ds1307_probe(struct i2c_client *client, if (err) return err; - err = rtc_register_device(ds1307->rtc); + err = devm_rtc_register_device(ds1307->rtc); if (err) return err; diff --git a/drivers/rtc/rtc-ds1343.c b/drivers/rtc/rtc-ds1343.c index ea663e24a34c..f14ed6c96437 100644 --- a/drivers/rtc/rtc-ds1343.c +++ b/drivers/rtc/rtc-ds1343.c @@ -408,7 +408,7 @@ static int ds1343_probe(struct spi_device *spi) dev_err(&spi->dev, "unable to create sysfs entries for rtc ds1343\n"); - res = rtc_register_device(priv->rtc); + res = devm_rtc_register_device(priv->rtc); if (res) return res; diff --git a/drivers/rtc/rtc-ds1347.c b/drivers/rtc/rtc-ds1347.c index 7025cf3fb9f8..157bf5209ac4 100644 --- a/drivers/rtc/rtc-ds1347.c +++ b/drivers/rtc/rtc-ds1347.c @@ -166,7 +166,7 @@ static int ds1347_probe(struct spi_device *spi) rtc->range_min = RTC_TIMESTAMP_BEGIN_0000; rtc->range_max = RTC_TIMESTAMP_END_9999; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct spi_driver ds1347_driver = { diff --git a/drivers/rtc/rtc-ds1374.c b/drivers/rtc/rtc-ds1374.c index 177d870bda0d..fab79921a712 100644 --- a/drivers/rtc/rtc-ds1374.c +++ b/drivers/rtc/rtc-ds1374.c @@ -508,7 +508,7 @@ static int ds1374_probe(struct i2c_client *client, ds1374->rtc->ops = &ds1374_rtc_ops; ds1374->rtc->range_max = U32_MAX; - ret = rtc_register_device(ds1374->rtc); + ret = devm_rtc_register_device(ds1374->rtc); 
if (ret) return ret; diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index d5f48216e851..bda884333082 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -466,7 +466,7 @@ static int ds1511_rtc_probe(struct platform_device *pdev) pdata->rtc->ops = &ds1511_rtc_ops; - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index bb40ea8b6373..dbff5b621ef5 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -295,7 +295,7 @@ static int ds1553_rtc_probe(struct platform_device *pdev) pdata->rtc->ops = &ds1553_rtc_ops; - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-ds1672.c b/drivers/rtc/rtc-ds1672.c index 9da84df9f152..630493759d15 100644 --- a/drivers/rtc/rtc-ds1672.c +++ b/drivers/rtc/rtc-ds1672.c @@ -124,7 +124,7 @@ static int ds1672_probe(struct i2c_client *client, rtc->ops = &ds1672_rtc_ops; rtc->range_max = U32_MAX; - err = rtc_register_device(rtc); + err = devm_rtc_register_device(rtc); if (err) return err; diff --git a/drivers/rtc/rtc-ds1685.c b/drivers/rtc/rtc-ds1685.c index bef588fce266..d69c807af29b 100644 --- a/drivers/rtc/rtc-ds1685.c +++ b/drivers/rtc/rtc-ds1685.c @@ -1321,7 +1321,7 @@ ds1685_rtc_probe(struct platform_device *pdev) if (ret) return ret; - return rtc_register_device(rtc_dev); + return devm_rtc_register_device(rtc_dev); } /** diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 39c6c3a85b34..13d45c697da6 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -191,7 +191,7 @@ static int ds1742_rtc_probe(struct platform_device *pdev) rtc->ops = &ds1742_rtc_ops; - ret = rtc_register_device(rtc); + ret = devm_rtc_register_device(rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-ds2404.c b/drivers/rtc/rtc-ds2404.c index 9df0c44512b8..0480f592307e 100644 --- a/drivers/rtc/rtc-ds2404.c +++ b/drivers/rtc/rtc-ds2404.c @@ -234,7 +234,7 @@ static int rtc_probe(struct platform_device *pdev) chip->rtc->ops = &ds2404_rtc_ops; chip->rtc->range_max = U32_MAX; - retval = rtc_register_device(chip->rtc); + retval = devm_rtc_register_device(chip->rtc); if (retval) return retval; diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c index 8ec9ea1ca72e..9a5a15cbcd9b 100644 --- a/drivers/rtc/rtc-ep93xx.c +++ b/drivers/rtc/rtc-ep93xx.c @@ -145,7 +145,7 @@ static int ep93xx_rtc_probe(struct platform_device *pdev) if (err) return err; - return rtc_register_device(ep93xx_rtc->rtc); + return devm_rtc_register_device(ep93xx_rtc->rtc); } static struct platform_driver ep93xx_rtc_driver = { diff --git a/drivers/rtc/rtc-fsl-ftm-alarm.c b/drivers/rtc/rtc-fsl-ftm-alarm.c index 48d3b38ea348..57cc09d0a806 100644 --- a/drivers/rtc/rtc-fsl-ftm-alarm.c +++ b/drivers/rtc/rtc-fsl-ftm-alarm.c @@ -290,7 +290,7 @@ static int ftm_rtc_probe(struct platform_device *pdev) if (ret) dev_err(&pdev->dev, "failed to enable irq wake\n"); - ret = rtc_register_device(rtc->rtc_dev); + ret = devm_rtc_register_device(rtc->rtc_dev); if (ret) { dev_err(&pdev->dev, "can't register rtc device\n"); return ret; diff --git a/drivers/rtc/rtc-ftrtc010.c b/drivers/rtc/rtc-ftrtc010.c index 0919f7dc94a3..ad3add5db4c8 100644 --- a/drivers/rtc/rtc-ftrtc010.c +++ b/drivers/rtc/rtc-ftrtc010.c @@ -176,7 +176,7 @@ static int ftrtc010_rtc_probe(struct platform_device *pdev) if (unlikely(ret)) return ret; - return 
rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } static int ftrtc010_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-goldfish.c b/drivers/rtc/rtc-goldfish.c index 6349d2cd3680..7ab95d052644 100644 --- a/drivers/rtc/rtc-goldfish.c +++ b/drivers/rtc/rtc-goldfish.c @@ -194,7 +194,7 @@ static int goldfish_rtc_probe(struct platform_device *pdev) if (err) return err; - return rtc_register_device(rtcdrv->rtc); + return devm_rtc_register_device(rtcdrv->rtc); } static const struct of_device_id goldfish_rtc_of_match[] = { diff --git a/drivers/rtc/rtc-imx-sc.c b/drivers/rtc/rtc-imx-sc.c index a5f59e6f862e..cc9fbab49999 100644 --- a/drivers/rtc/rtc-imx-sc.c +++ b/drivers/rtc/rtc-imx-sc.c @@ -166,7 +166,7 @@ static int imx_sc_rtc_probe(struct platform_device *pdev) imx_sc_rtc->range_min = 0; imx_sc_rtc->range_max = U32_MAX; - ret = rtc_register_device(imx_sc_rtc); + ret = devm_rtc_register_device(imx_sc_rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c index 8d141d8a5490..c2692da74e09 100644 --- a/drivers/rtc/rtc-imxdi.c +++ b/drivers/rtc/rtc-imxdi.c @@ -814,7 +814,7 @@ static int __init dryice_rtc_probe(struct platform_device *pdev) imxdi->rtc->ops = &dryice_rtc_ops; imxdi->rtc->range_max = U32_MAX; - rc = rtc_register_device(imxdi->rtc); + rc = devm_rtc_register_device(imxdi->rtc); if (rc) goto err; diff --git a/drivers/rtc/rtc-isl12026.c b/drivers/rtc/rtc-isl12026.c index fff8d8253669..1fc6627d854d 100644 --- a/drivers/rtc/rtc-isl12026.c +++ b/drivers/rtc/rtc-isl12026.c @@ -469,7 +469,7 @@ static int isl12026_probe_new(struct i2c_client *client) if (ret) return ret; - return rtc_register_device(priv->rtc); + return devm_rtc_register_device(priv->rtc); } static int isl12026_remove(struct i2c_client *client) diff --git a/drivers/rtc/rtc-isl1208.c b/drivers/rtc/rtc-isl1208.c index 08d778b10e9e..563a6d9c9fcf 100644 --- a/drivers/rtc/rtc-isl1208.c +++ b/drivers/rtc/rtc-isl1208.c @@ -894,7 +894,7 @@ isl1208_probe(struct i2c_client *client, const struct i2c_device_id *id) if (rc) return rc; - return rtc_register_device(isl1208->rtc); + return devm_rtc_register_device(isl1208->rtc); } static struct i2c_driver isl1208_driver = { diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index 9607e6b6e0b3..6e51df72fd65 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -375,7 +375,7 @@ static int jz4740_rtc_probe(struct platform_device *pdev) /* Each 1 Hz pulse should happen after (rate) ticks */ jz4740_rtc_reg_write(rtc, JZ_REG_RTC_REGULATOR, rate - 1); - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c index 15d8abda81fe..76ad7031a13d 100644 --- a/drivers/rtc/rtc-lpc32xx.c +++ b/drivers/rtc/rtc-lpc32xx.c @@ -239,7 +239,7 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) rtc->rtc->ops = &lpc32xx_rtc_ops; rtc->rtc->range_max = U32_MAX; - err = rtc_register_device(rtc->rtc); + err = devm_rtc_register_device(rtc->rtc); if (err) return err; diff --git a/drivers/rtc/rtc-ls1x.c b/drivers/rtc/rtc-ls1x.c index 8bd34056fea0..5af26dc5c2a3 100644 --- a/drivers/rtc/rtc-ls1x.c +++ b/drivers/rtc/rtc-ls1x.c @@ -176,7 +176,7 @@ static int ls1x_rtc_probe(struct platform_device *pdev) rtcdev->range_min = RTC_TIMESTAMP_BEGIN_1900; rtcdev->range_max = RTC_TIMESTAMP_END_2099; - return rtc_register_device(rtcdev); + return devm_rtc_register_device(rtcdev); } static struct 
platform_driver ls1x_rtc_driver = { diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index 8a89bc52b0d4..160dcf68e64e 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -977,7 +977,7 @@ static int m41t80_probe(struct i2c_client *client, m41t80_sqw_register_clk(m41t80_data); #endif - rc = rtc_register_device(m41t80_data->rtc); + rc = devm_rtc_register_device(m41t80_data->rtc); if (rc) return rc; diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index e966a66ab2d3..5f5898d3b055 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -470,7 +470,7 @@ static int m48t59_rtc_probe(struct platform_device *pdev) if (ret) return ret; - ret = rtc_register_device(m48t59->rtc); + ret = devm_rtc_register_device(m48t59->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index 182cfe59e4e0..481c9525b1dd 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -255,7 +255,7 @@ static int m48t86_rtc_probe(struct platform_device *pdev) info->rtc->ops = &m48t86_rtc_ops; - err = rtc_register_device(info->rtc); + err = devm_rtc_register_device(info->rtc); if (err) return err; diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index d6802e6191cb..d4234e78497e 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -307,7 +307,7 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) mc13xxx_unlock(mc13xxx); - ret = rtc_register_device(priv->rtc); + ret = devm_rtc_register_device(priv->rtc); if (ret) { mc13xxx_lock(mc13xxx); goto err_irq_request; diff --git a/drivers/rtc/rtc-meson-vrtc.c b/drivers/rtc/rtc-meson-vrtc.c index e6bd0808a092..1463c8621561 100644 --- a/drivers/rtc/rtc-meson-vrtc.c +++ b/drivers/rtc/rtc-meson-vrtc.c @@ -83,7 +83,7 @@ static int meson_vrtc_probe(struct platform_device *pdev) return PTR_ERR(vrtc->rtc); vrtc->rtc->ops = &meson_vrtc_ops; - return rtc_register_device(vrtc->rtc); + return devm_rtc_register_device(vrtc->rtc); } static int __maybe_unused meson_vrtc_suspend(struct device *dev) diff --git a/drivers/rtc/rtc-meson.c b/drivers/rtc/rtc-meson.c index 938267713a4d..8642c06565ea 100644 --- a/drivers/rtc/rtc-meson.c +++ b/drivers/rtc/rtc-meson.c @@ -369,7 +369,7 @@ static int meson_rtc_probe(struct platform_device *pdev) if (ret) goto out_disable_vdd; - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) goto out_disable_vdd; diff --git a/drivers/rtc/rtc-mpc5121.c b/drivers/rtc/rtc-mpc5121.c index 5c2ce71aa044..bb2ea9bc56f2 100644 --- a/drivers/rtc/rtc-mpc5121.c +++ b/drivers/rtc/rtc-mpc5121.c @@ -371,7 +371,7 @@ static int mpc5121_rtc_probe(struct platform_device *op) rtc->rtc->range_max = U32_MAX; } - err = rtc_register_device(rtc->rtc); + err = devm_rtc_register_device(rtc->rtc); if (err) goto out_dispose2; diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index 17bf5394e1e5..421b3b6071b6 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -361,7 +361,7 @@ static int vrtc_mrst_do_probe(struct device *dev, struct resource *iomem, } } - retval = rtc_register_device(mrst_rtc.rtc); + retval = devm_rtc_register_device(mrst_rtc.rtc); if (retval) goto cleanup0; diff --git a/drivers/rtc/rtc-mt2712.c b/drivers/rtc/rtc-mt2712.c index d5f691c8a035..cd92a9788351 100644 --- a/drivers/rtc/rtc-mt2712.c +++ b/drivers/rtc/rtc-mt2712.c @@ -352,7 +352,7 @@ static int mt2712_rtc_probe(struct platform_device *pdev) mt2712_rtc->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; 
mt2712_rtc->rtc->range_max = MT2712_RTC_TIMESTAMP_END_2127; - return rtc_register_device(mt2712_rtc->rtc); + return devm_rtc_register_device(mt2712_rtc->rtc); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-mt6397.c b/drivers/rtc/rtc-mt6397.c index 1894aded4c85..6655035e5164 100644 --- a/drivers/rtc/rtc-mt6397.c +++ b/drivers/rtc/rtc-mt6397.c @@ -301,7 +301,7 @@ static int mtk_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->ops = &mtk_rtc_ops; - return rtc_register_device(rtc->rtc_dev); + return devm_rtc_register_device(rtc->rtc_dev); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-mv.c b/drivers/rtc/rtc-mv.c index d5f190e578e4..f8e2ecea1d8d 100644 --- a/drivers/rtc/rtc-mv.c +++ b/drivers/rtc/rtc-mv.c @@ -278,7 +278,7 @@ static int __init mv_rtc_probe(struct platform_device *pdev) pdata->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; pdata->rtc->range_max = RTC_TIMESTAMP_END_2099; - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (!ret) return 0; out: diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 0d253ce3a8f5..65b29b0fa548 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -408,7 +408,7 @@ static int mxc_rtc_probe(struct platform_device *pdev) dev_err(&pdev->dev, "failed to enable irq wake\n"); } - ret = rtc_register_device(rtc); + ret = devm_rtc_register_device(rtc); return ret; } diff --git a/drivers/rtc/rtc-mxc_v2.c b/drivers/rtc/rtc-mxc_v2.c index 91534560fe2a..0d73f6f0cf9e 100644 --- a/drivers/rtc/rtc-mxc_v2.c +++ b/drivers/rtc/rtc-mxc_v2.c @@ -354,7 +354,7 @@ static int mxc_rtc_probe(struct platform_device *pdev) return ret; } - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (ret < 0) clk_unprepare(pdata->clk); diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index e65f79fc7718..dc7db2477f88 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -886,7 +886,7 @@ static int omap_rtc_probe(struct platform_device *pdev) goto err; } - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) goto err; diff --git a/drivers/rtc/rtc-pcap.c b/drivers/rtc/rtc-pcap.c index 178bfb1dea21..8c7a98a5452c 100644 --- a/drivers/rtc/rtc-pcap.c +++ b/drivers/rtc/rtc-pcap.c @@ -163,7 +163,7 @@ static int __init pcap_rtc_probe(struct platform_device *pdev) if (err) return err; - return rtc_register_device(pcap_rtc->rtc); + return devm_rtc_register_device(pcap_rtc->rtc); } static int __exit pcap_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-pcf2123.c b/drivers/rtc/rtc-pcf2123.c index c3691fa4210e..534ffc91eec1 100644 --- a/drivers/rtc/rtc-pcf2123.c +++ b/drivers/rtc/rtc-pcf2123.c @@ -434,7 +434,7 @@ static int pcf2123_probe(struct spi_device *spi) rtc->range_max = RTC_TIMESTAMP_END_2099; rtc->set_start_time = true; - ret = rtc_register_device(rtc); + ret = devm_rtc_register_device(rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c index 432cd627359b..33fa8b17b79c 100644 --- a/drivers/rtc/rtc-pcf2127.c +++ b/drivers/rtc/rtc-pcf2127.c @@ -682,7 +682,7 @@ static int pcf2127_probe(struct device *dev, struct regmap *regmap, return ret; } - return rtc_register_device(pcf2127->rtc); + return devm_rtc_register_device(pcf2127->rtc); } #ifdef CONFIG_OF diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c index c19f139e9b8d..e19cf2adbc35 100644 --- a/drivers/rtc/rtc-pcf85063.c +++ b/drivers/rtc/rtc-pcf85063.c @@ -614,7 +614,7 @@ static int 
pcf85063_probe(struct i2c_client *client) pcf85063_clkout_register_clk(pcf85063); #endif - return rtc_register_device(pcf85063->rtc); + return devm_rtc_register_device(pcf85063->rtc); } #ifdef CONFIG_OF diff --git a/drivers/rtc/rtc-pcf85363.c b/drivers/rtc/rtc-pcf85363.c index 23cf14ca2c96..a574c8d15a5c 100644 --- a/drivers/rtc/rtc-pcf85363.c +++ b/drivers/rtc/rtc-pcf85363.c @@ -418,7 +418,7 @@ static int pcf85363_probe(struct i2c_client *client, pcf85363->rtc->ops = &rtc_ops_alarm; } - ret = rtc_register_device(pcf85363->rtc); + ret = devm_rtc_register_device(pcf85363->rtc); for (i = 0; i < config->num_nvram; i++) { nvmem_cfg[i].priv = pcf85363; diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c index 2dc30eafa639..de3e6c355f2e 100644 --- a/drivers/rtc/rtc-pcf8563.c +++ b/drivers/rtc/rtc-pcf8563.c @@ -582,7 +582,7 @@ static int pcf8563_probe(struct i2c_client *client, } } - err = rtc_register_device(pcf8563->rtc); + err = devm_rtc_register_device(pcf8563->rtc); if (err) return err; diff --git a/drivers/rtc/rtc-pic32.c b/drivers/rtc/rtc-pic32.c index 2b6946744654..7fb9145c43bd 100644 --- a/drivers/rtc/rtc-pic32.c +++ b/drivers/rtc/rtc-pic32.c @@ -338,7 +338,7 @@ static int pic32_rtc_probe(struct platform_device *pdev) pdata->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; pdata->rtc->range_max = RTC_TIMESTAMP_END_2099; - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (ret) goto err_nortc; diff --git a/drivers/rtc/rtc-pl030.c b/drivers/rtc/rtc-pl030.c index ebe03eba8f5f..5a880516f3e8 100644 --- a/drivers/rtc/rtc-pl030.c +++ b/drivers/rtc/rtc-pl030.c @@ -121,7 +121,7 @@ static int pl030_probe(struct amba_device *dev, const struct amba_id *id) if (ret) goto err_irq; - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) goto err_reg; diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c index d4b2ab786126..224bbf096262 100644 --- a/drivers/rtc/rtc-pl031.c +++ b/drivers/rtc/rtc-pl031.c @@ -370,7 +370,7 @@ static int pl031_probe(struct amba_device *adev, const struct amba_id *id) ldata->rtc->range_min = vendor->range_min; ldata->rtc->range_max = vendor->range_max; - ret = rtc_register_device(ldata->rtc); + ret = devm_rtc_register_device(ldata->rtc); if (ret) goto out; diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index b45ee2cb2c04..0d9dd6faabba 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -508,7 +508,7 @@ static int pm8xxx_rtc_probe(struct platform_device *pdev) return rc; } - return rtc_register_device(rtc_dd->rtc); + return devm_rtc_register_device(rtc_dd->rtc); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-ps3.c b/drivers/rtc/rtc-ps3.c index f0336d691e6c..6b098734c715 100644 --- a/drivers/rtc/rtc-ps3.c +++ b/drivers/rtc/rtc-ps3.c @@ -56,7 +56,7 @@ static int __init ps3_rtc_probe(struct platform_device *dev) platform_set_drvdata(dev, rtc); - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver ps3_rtc_driver = { diff --git a/drivers/rtc/rtc-r9701.c b/drivers/rtc/rtc-r9701.c index 7ceb968f0e44..60a3c3d7499b 100644 --- a/drivers/rtc/rtc-r9701.c +++ b/drivers/rtc/rtc-r9701.c @@ -127,7 +127,7 @@ static int r9701_probe(struct spi_device *spi) rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rtc->range_max = RTC_TIMESTAMP_END_2099; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct spi_driver r9701_driver = { diff --git a/drivers/rtc/rtc-rc5t619.c 
b/drivers/rtc/rtc-rc5t619.c index dd1a20977478..e73102a39f1b 100644 --- a/drivers/rtc/rtc-rc5t619.c +++ b/drivers/rtc/rtc-rc5t619.c @@ -426,7 +426,7 @@ static int rc5t619_rtc_probe(struct platform_device *pdev) dev_warn(&pdev->dev, "rc5t619 interrupt is disabled\n"); } - return rtc_register_device(rtc->rtc); + return devm_rtc_register_device(rtc->rtc); } static struct platform_driver rc5t619_rtc_driver = { diff --git a/drivers/rtc/rtc-rk808.c b/drivers/rtc/rtc-rk808.c index c0334c602e88..e920da8c08da 100644 --- a/drivers/rtc/rtc-rk808.c +++ b/drivers/rtc/rtc-rk808.c @@ -447,7 +447,7 @@ static int rk808_rtc_probe(struct platform_device *pdev) return ret; } - return rtc_register_device(rk808_rtc->rtc); + return devm_rtc_register_device(rk808_rtc->rtc); } static struct platform_driver rk808_rtc_driver = { diff --git a/drivers/rtc/rtc-rp5c01.c b/drivers/rtc/rtc-rp5c01.c index 8bc476c0905f..44afa6d996e7 100644 --- a/drivers/rtc/rtc-rp5c01.c +++ b/drivers/rtc/rtc-rp5c01.c @@ -259,7 +259,7 @@ static int __init rp5c01_rtc_probe(struct platform_device *dev) if (error) return error; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver rp5c01_rtc_driver = { diff --git a/drivers/rtc/rtc-rs5c348.c b/drivers/rtc/rtc-rs5c348.c index 47c13678449e..fec633f80789 100644 --- a/drivers/rtc/rtc-rs5c348.c +++ b/drivers/rtc/rtc-rs5c348.c @@ -197,7 +197,7 @@ static int rs5c348_probe(struct spi_device *spi) rtc->ops = &rs5c348_rtc_ops; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct spi_driver rs5c348_driver = { diff --git a/drivers/rtc/rtc-rv3028.c b/drivers/rtc/rtc-rv3028.c index f788df979750..979407a51c7a 100644 --- a/drivers/rtc/rtc-rv3028.c +++ b/drivers/rtc/rtc-rv3028.c @@ -886,7 +886,7 @@ static int rv3028_probe(struct i2c_client *client) rv3028->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rv3028->rtc->range_max = RTC_TIMESTAMP_END_2099; rv3028->rtc->ops = &rv3028_rtc_ops; - ret = rtc_register_device(rv3028->rtc); + ret = devm_rtc_register_device(rv3028->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-rv3029c2.c b/drivers/rtc/rtc-rv3029c2.c index ad359b3b74b2..dc1bda62095e 100644 --- a/drivers/rtc/rtc-rv3029c2.c +++ b/drivers/rtc/rtc-rv3029c2.c @@ -750,7 +750,7 @@ static int rv3029_probe(struct device *dev, struct regmap *regmap, int irq, rv3029->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rv3029->rtc->range_max = RTC_TIMESTAMP_END_2079; - rc = rtc_register_device(rv3029->rtc); + rc = devm_rtc_register_device(rv3029->rtc); if (rc) return rc; diff --git a/drivers/rtc/rtc-rv3032.c b/drivers/rtc/rtc-rv3032.c index ed9cba3292e6..c9bcea727757 100644 --- a/drivers/rtc/rtc-rv3032.c +++ b/drivers/rtc/rtc-rv3032.c @@ -885,7 +885,7 @@ static int rv3032_probe(struct i2c_client *client) rv3032->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rv3032->rtc->range_max = RTC_TIMESTAMP_END_2099; rv3032->rtc->ops = &rv3032_rtc_ops; - ret = rtc_register_device(rv3032->rtc); + ret = devm_rtc_register_device(rv3032->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-rv8803.c b/drivers/rtc/rtc-rv8803.c index 44e1818a751c..d4ea6db51b26 100644 --- a/drivers/rtc/rtc-rv8803.c +++ b/drivers/rtc/rtc-rv8803.c @@ -587,7 +587,7 @@ static int rv8803_probe(struct i2c_client *client, rv8803->rtc->ops = &rv8803_rtc_ops; rv8803->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rv8803->rtc->range_max = RTC_TIMESTAMP_END_2099; - err = rtc_register_device(rv8803->rtc); + err = devm_rtc_register_device(rv8803->rtc); if (err) return err; diff --git 
a/drivers/rtc/rtc-rx8010.c b/drivers/rtc/rtc-rx8010.c index dca41a2a39b2..8340ab47a059 100644 --- a/drivers/rtc/rtc-rx8010.c +++ b/drivers/rtc/rtc-rx8010.c @@ -419,7 +419,7 @@ static int rx8010_probe(struct i2c_client *client) rx8010->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rx8010->rtc->range_max = RTC_TIMESTAMP_END_2099; - return rtc_register_device(rx8010->rtc); + return devm_rtc_register_device(rx8010->rtc); } static struct i2c_driver rx8010_driver = { diff --git a/drivers/rtc/rtc-rx8581.c b/drivers/rtc/rtc-rx8581.c index 017f74721cc0..de109139529b 100644 --- a/drivers/rtc/rtc-rx8581.c +++ b/drivers/rtc/rtc-rx8581.c @@ -298,7 +298,7 @@ static int rx8581_probe(struct i2c_client *client, rx8581->rtc->start_secs = 0; rx8581->rtc->set_start_time = true; - ret = rtc_register_device(rx8581->rtc); + ret = devm_rtc_register_device(rx8581->rtc); for (i = 0; i < config->num_nvram; i++) { nvmem_cfg[i].priv = rx8581; diff --git a/drivers/rtc/rtc-s35390a.c b/drivers/rtc/rtc-s35390a.c index 03672a246356..ea15d0392bb9 100644 --- a/drivers/rtc/rtc-s35390a.c +++ b/drivers/rtc/rtc-s35390a.c @@ -497,7 +497,7 @@ static int s35390a_probe(struct i2c_client *client, if (status1 & S35390A_FLAG_INT2) rtc_update_irq(s35390a->rtc, 1, RTC_AF); - return rtc_register_device(s35390a->rtc); + return devm_rtc_register_device(s35390a->rtc); } static struct i2c_driver s35390a_driver = { diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index 9ccc97cf5e09..1250887e4382 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -205,7 +205,7 @@ int sa1100_rtc_init(struct platform_device *pdev, struct sa1100_rtc *info) info->rtc->max_user_freq = RTC_FREQ; info->rtc->range_max = U32_MAX; - ret = rtc_register_device(info->rtc); + ret = devm_rtc_register_device(info->rtc); if (ret) { clk_disable_unprepare(info->clk); return ret; diff --git a/drivers/rtc/rtc-sc27xx.c b/drivers/rtc/rtc-sc27xx.c index a953bc0a5a5b..187aa955b79c 100644 --- a/drivers/rtc/rtc-sc27xx.c +++ b/drivers/rtc/rtc-sc27xx.c @@ -618,7 +618,7 @@ static int sprd_rtc_probe(struct platform_device *pdev) rtc->rtc->ops = &sprd_rtc_ops; rtc->rtc->range_min = 0; rtc->rtc->range_max = 5662310399LL; - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) { device_init_wakeup(&pdev->dev, 0); return ret; diff --git a/drivers/rtc/rtc-sd3078.c b/drivers/rtc/rtc-sd3078.c index a7aa943c1183..f6bee69ba017 100644 --- a/drivers/rtc/rtc-sd3078.c +++ b/drivers/rtc/rtc-sd3078.c @@ -192,7 +192,7 @@ static int sd3078_probe(struct i2c_client *client, sd3078->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; sd3078->rtc->range_max = RTC_TIMESTAMP_END_2099; - ret = rtc_register_device(sd3078->rtc); + ret = devm_rtc_register_device(sd3078->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c index 9167b48014a1..cd146b574143 100644 --- a/drivers/rtc/rtc-sh.c +++ b/drivers/rtc/rtc-sh.c @@ -607,7 +607,7 @@ static int __init sh_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->range_max = mktime64(2098, 12, 31, 23, 59, 59); } - ret = rtc_register_device(rtc->rtc_dev); + ret = devm_rtc_register_device(rtc->rtc_dev); if (ret) goto err_unmap; diff --git a/drivers/rtc/rtc-sirfsoc.c b/drivers/rtc/rtc-sirfsoc.c index abf19435dbad..03a6cca23201 100644 --- a/drivers/rtc/rtc-sirfsoc.c +++ b/drivers/rtc/rtc-sirfsoc.c @@ -356,7 +356,7 @@ static int sirfsoc_rtc_probe(struct platform_device *pdev) return err; } - return rtc_register_device(rtcdrv->rtc); + return devm_rtc_register_device(rtcdrv->rtc); } #ifdef 
CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-snvs.c b/drivers/rtc/rtc-snvs.c index a7d39a49b748..bd929b0e7d7d 100644 --- a/drivers/rtc/rtc-snvs.c +++ b/drivers/rtc/rtc-snvs.c @@ -387,7 +387,7 @@ static int snvs_rtc_probe(struct platform_device *pdev) data->rtc->ops = &snvs_rtc_ops; data->rtc->range_max = U32_MAX; - return rtc_register_device(data->rtc); + return devm_rtc_register_device(data->rtc); } static int __maybe_unused snvs_rtc_suspend_noirq(struct device *dev) diff --git a/drivers/rtc/rtc-st-lpc.c b/drivers/rtc/rtc-st-lpc.c index 0c65448b85ee..bdb20f63254e 100644 --- a/drivers/rtc/rtc-st-lpc.c +++ b/drivers/rtc/rtc-st-lpc.c @@ -250,7 +250,7 @@ static int st_rtc_probe(struct platform_device *pdev) rtc->rtc_dev->range_max = U64_MAX; do_div(rtc->rtc_dev->range_max, rtc->clkrate); - ret = rtc_register_device(rtc->rtc_dev); + ret = devm_rtc_register_device(rtc->rtc_dev); if (ret) { clk_disable_unprepare(rtc->clk); return ret; diff --git a/drivers/rtc/rtc-starfire.c b/drivers/rtc/rtc-starfire.c index 37a26279e107..fbd1ed41cbf1 100644 --- a/drivers/rtc/rtc-starfire.c +++ b/drivers/rtc/rtc-starfire.c @@ -48,7 +48,7 @@ static int __init starfire_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc); - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver starfire_rtc_driver = { diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index ad616bce7bca..7cb6be1b7815 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -317,7 +317,7 @@ static int stk17ta8_rtc_probe(struct platform_device *pdev) if (ret) return ret; - return rtc_register_device(pdata->rtc); + return devm_rtc_register_device(pdata->rtc); } /* work with hotplug and coldplug */ diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 0a969af80af7..40c0f7ed36e0 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -366,7 +366,7 @@ static int stmp3xxx_rtc_probe(struct platform_device *pdev) rtc_data->rtc->ops = &stmp3xxx_rtc_ops; rtc_data->rtc->range_max = U32_MAX; - err = rtc_register_device(rtc_data->rtc); + err = devm_rtc_register_device(rtc_data->rtc); if (err) return err; diff --git a/drivers/rtc/rtc-sun4v.c b/drivers/rtc/rtc-sun4v.c index 036463dfa103..a86e27de8c06 100644 --- a/drivers/rtc/rtc-sun4v.c +++ b/drivers/rtc/rtc-sun4v.c @@ -86,7 +86,7 @@ static int __init sun4v_rtc_probe(struct platform_device *pdev) rtc->range_max = U64_MAX; platform_set_drvdata(pdev, rtc); - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver sun4v_rtc_driver = { diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c index f2818cdd11d8..adec1b14a8de 100644 --- a/drivers/rtc/rtc-sun6i.c +++ b/drivers/rtc/rtc-sun6i.c @@ -726,7 +726,7 @@ static int sun6i_rtc_probe(struct platform_device *pdev) chip->rtc->ops = &sun6i_rtc_ops; chip->rtc->range_max = 2019686399LL; /* 2033-12-31 23:59:59 */ - ret = rtc_register_device(chip->rtc); + ret = devm_rtc_register_device(chip->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-sunxi.c b/drivers/rtc/rtc-sunxi.c index f5d7f44550ce..5d019e3a835a 100644 --- a/drivers/rtc/rtc-sunxi.c +++ b/drivers/rtc/rtc-sunxi.c @@ -470,7 +470,7 @@ static int sunxi_rtc_probe(struct platform_device *pdev) chip->rtc->ops = &sunxi_rtc_ops; - return rtc_register_device(chip->rtc); + return devm_rtc_register_device(chip->rtc); } static struct platform_driver sunxi_rtc_driver = { diff --git a/drivers/rtc/rtc-tegra.c 
b/drivers/rtc/rtc-tegra.c index 7fbb1741692f..8925015cc698 100644 --- a/drivers/rtc/rtc-tegra.c +++ b/drivers/rtc/rtc-tegra.c @@ -329,7 +329,7 @@ static int tegra_rtc_probe(struct platform_device *pdev) goto disable_clk; } - ret = rtc_register_device(info->rtc); + ret = devm_rtc_register_device(info->rtc); if (ret) goto disable_clk; diff --git a/drivers/rtc/rtc-test.c b/drivers/rtc/rtc-test.c index 74b3a0603b73..b092a1648513 100644 --- a/drivers/rtc/rtc-test.c +++ b/drivers/rtc/rtc-test.c @@ -139,7 +139,7 @@ static int test_probe(struct platform_device *plat_dev) timer_setup(&rtd->alarm, test_rtc_alarm_handler, 0); rtd->alarm.expires = 0; - return rtc_register_device(rtd->rtc); + return devm_rtc_register_device(rtd->rtc); } static struct platform_driver test_driver = { diff --git a/drivers/rtc/rtc-tps6586x.c b/drivers/rtc/rtc-tps6586x.c index e39af2d67051..a980337c3065 100644 --- a/drivers/rtc/rtc-tps6586x.c +++ b/drivers/rtc/rtc-tps6586x.c @@ -280,7 +280,7 @@ static int tps6586x_rtc_probe(struct platform_device *pdev) goto fail_rtc_register; } - ret = rtc_register_device(rtc->rtc); + ret = devm_rtc_register_device(rtc->rtc); if (ret) goto fail_rtc_register; diff --git a/drivers/rtc/rtc-tps65910.c b/drivers/rtc/rtc-tps65910.c index e3840386f430..2d87b62826a8 100644 --- a/drivers/rtc/rtc-tps65910.c +++ b/drivers/rtc/rtc-tps65910.c @@ -434,7 +434,7 @@ static int tps65910_rtc_probe(struct platform_device *pdev) tps_rtc->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; tps_rtc->rtc->range_max = RTC_TIMESTAMP_END_2099; - return rtc_register_device(tps_rtc->rtc); + return devm_rtc_register_device(tps_rtc->rtc); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index 11f46272bad3..c3309db5448d 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -275,7 +275,7 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) if (ret) return ret; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static int __exit tx4939_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index c3671043ace7..5a9f9ad86d32 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -335,7 +335,7 @@ static int rtc_probe(struct platform_device *pdev) dev_info(&pdev->dev, "Real Time Clock of NEC VR4100 series\n"); - retval = rtc_register_device(rtc); + retval = devm_rtc_register_device(rtc); if (retval) goto err_iounmap_all; diff --git a/drivers/rtc/rtc-vt8500.c b/drivers/rtc/rtc-vt8500.c index e2588625025f..197b649cd629 100644 --- a/drivers/rtc/rtc-vt8500.c +++ b/drivers/rtc/rtc-vt8500.c @@ -232,7 +232,7 @@ static int vt8500_rtc_probe(struct platform_device *pdev) return ret; } - return rtc_register_device(vt8500_rtc->rtc); + return devm_rtc_register_device(vt8500_rtc->rtc); } static int vt8500_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/rtc-wilco-ec.c b/drivers/rtc/rtc-wilco-ec.c index ff46066a68a4..2a205a646452 100644 --- a/drivers/rtc/rtc-wilco-ec.c +++ b/drivers/rtc/rtc-wilco-ec.c @@ -176,7 +176,7 @@ static int wilco_ec_rtc_probe(struct platform_device *pdev) rtc->range_max = RTC_TIMESTAMP_END_2099; rtc->owner = THIS_MODULE; - return rtc_register_device(rtc); + return devm_rtc_register_device(rtc); } static struct platform_driver wilco_ec_rtc_driver = { diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c index ccef887d2690..640833e21057 100644 --- a/drivers/rtc/rtc-wm831x.c +++ b/drivers/rtc/rtc-wm831x.c @@ -429,7 +429,7 @@ static int 
wm831x_rtc_probe(struct platform_device *pdev) wm831x_rtc->rtc->ops = &wm831x_rtc_ops; wm831x_rtc->rtc->range_max = U32_MAX; - ret = rtc_register_device(wm831x_rtc->rtc); + ret = devm_rtc_register_device(wm831x_rtc->rtc); if (ret) return ret; diff --git a/drivers/rtc/rtc-xgene.c b/drivers/rtc/rtc-xgene.c index 96db441f92b3..cf68a9b1c9eb 100644 --- a/drivers/rtc/rtc-xgene.c +++ b/drivers/rtc/rtc-xgene.c @@ -185,7 +185,7 @@ static int xgene_rtc_probe(struct platform_device *pdev) pdata->rtc->ops = &xgene_rtc_ops; pdata->rtc->range_max = U32_MAX; - ret = rtc_register_device(pdata->rtc); + ret = devm_rtc_register_device(pdata->rtc); if (ret) { clk_disable_unprepare(pdata->clk); return ret; diff --git a/drivers/rtc/rtc-zynqmp.c b/drivers/rtc/rtc-zynqmp.c index 4b1077e2f826..f440bb52be92 100644 --- a/drivers/rtc/rtc-zynqmp.c +++ b/drivers/rtc/rtc-zynqmp.c @@ -264,7 +264,7 @@ static int xlnx_rtc_probe(struct platform_device *pdev) device_init_wakeup(&pdev->dev, 1); - return rtc_register_device(xrtcdev->rtc); + return devm_rtc_register_device(xrtcdev->rtc); } static int xlnx_rtc_remove(struct platform_device *pdev) diff --git a/drivers/rtc/sysfs.c b/drivers/rtc/sysfs.c index 950fac0d41ff..8a957d31a1a4 100644 --- a/drivers/rtc/sysfs.c +++ b/drivers/rtc/sysfs.c @@ -317,8 +317,6 @@ int rtc_add_groups(struct rtc_device *rtc, const struct attribute_group **grps) size_t old_cnt = 0, add_cnt = 0, new_cnt; const struct attribute_group **groups, **old; - if (rtc->registered) - return -EINVAL; if (!grps) return -EINVAL; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index cbca651d8ca4..55e7beed066c 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -118,8 +118,6 @@ struct rtc_device { */ long set_offset_nsec; - bool registered; - time64_t range_min; timeu64_t range_max; time64_t start_secs; @@ -157,7 +155,7 @@ extern struct rtc_device *devm_rtc_device_register(struct device *dev, const struct rtc_class_ops *ops, struct module *owner); struct rtc_device *devm_rtc_allocate_device(struct device *dev); -int __rtc_register_device(struct module *owner, struct rtc_device *rtc); +int __devm_rtc_register_device(struct module *owner, struct rtc_device *rtc); extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); @@ -234,8 +232,8 @@ static inline bool rtc_tv_nsec_ok(s64 set_offset_nsec, return false; } -#define rtc_register_device(device) \ - __rtc_register_device(THIS_MODULE, device) +#define devm_rtc_register_device(device) \ + __devm_rtc_register_device(THIS_MODULE, device) #ifdef CONFIG_RTC_HCTOSYS_DEVICE extern int rtc_hctosys_ret; -- cgit v1.2.3 From ded5ed04d85e299770dcb7e82c2127b8054a00c8 Mon Sep 17 00:00:00 2001 From: Souradeep Chowdhury Date: Wed, 30 Sep 2020 13:44:13 +0530 Subject: soc: qcom: llcc: Add configuration data for SM8150 Add LLCC configuration data for SM8150 SoC which controls LLCC behaviour. 
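
As an illustration, a client consumes one of these slice IDs through the
existing llcc-qcom API; the table added here only decides which IDs are
valid on SM8150. A minimal sketch, assuming a hypothetical driver that
wants the LLCC_NPU slice defined in this patch (function and message
text are illustrative only, not part of this patch):

    #include <linux/device.h>
    #include <linux/err.h>
    #include <linux/soc/qcom/llcc-qcom.h>

    /* Hypothetical client: look up and activate the NPU slice. */
    static int example_npu_cache_setup(struct device *dev)
    {
            struct llcc_slice_desc *slice;
            int ret;

            slice = llcc_slice_getd(LLCC_NPU);
            if (IS_ERR(slice))
                    return PTR_ERR(slice);

            ret = llcc_slice_activate(slice);
            if (ret) {
                    llcc_slice_putd(slice);
                    return ret;
            }

            dev_info(dev, "using LLCC slice %d (%zu kB)\n",
                     llcc_get_slice_id(slice), llcc_get_slice_size(slice));
            return 0;
    }

The capacity, cache mode and priority come from the sm8150_data entry;
the client code itself stays SoC-agnostic.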
Signed-off-by: Souradeep Chowdhury Link: https://lore.kernel.org/r/957e3ae50c75720ef6227529d5ce3d4b457802e9.1601452132.git.schowdhu@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/llcc-qcom.c | 30 ++++++++++++++++++++++++++++++ include/linux/soc/qcom/llcc-qcom.h | 6 ++++++ 2 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c index 96c20e673436..16b421608e9c 100644 --- a/drivers/soc/qcom/llcc-qcom.c +++ b/drivers/soc/qcom/llcc-qcom.c @@ -123,6 +123,30 @@ static const struct llcc_slice_config sdm845_data[] = { { LLCC_AUDHW, 22, 1024, 1, 1, 0xffc, 0x2, 0, 0, 1, 1, 0 }, }; +static const struct llcc_slice_config sm8150_data[] = { + { LLCC_CPUSS, 1, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 1 }, + { LLCC_VIDSC0, 2, 512, 2, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_VIDSC1, 3, 512, 2, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_AUDIO, 6, 1024, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MDMHPGRW, 7, 3072, 1, 0, 0xFF, 0xF00, 0, 0, 0, 1, 0 }, + { LLCC_MDM, 8, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MODHW, 9, 1024, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_CMPT, 10, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_GPUHTW , 11, 512, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_GPU, 12, 2560, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MMUHWT, 13, 1024, 1, 1, 0xFFF, 0x0, 0, 0, 0, 0, 1 }, + { LLCC_CMPTDMA, 15, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_DISP, 16, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MDMHPFX, 20, 1024, 2, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MDMHPFX, 21, 1024, 0, 1, 0xF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_AUDHW, 22, 1024, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_NPU, 23, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_WLHW, 24, 3072, 1, 1, 0xFFF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_MODPE, 29, 256, 1, 1, 0xF, 0x0, 0, 0, 0, 1, 0 }, + { LLCC_APTCM, 30, 256, 3, 1, 0x0, 0x1, 1, 0, 0, 1, 0 }, + { LLCC_WRCACHE, 31, 128, 1, 1, 0xFFF, 0x0, 0, 0, 0, 0, 0 }, +}; + static const struct qcom_llcc_config sc7180_cfg = { .sct_data = sc7180_data, .size = ARRAY_SIZE(sc7180_data), @@ -135,6 +159,11 @@ static const struct qcom_llcc_config sdm845_cfg = { .need_llcc_cfg = false, }; +static const struct qcom_llcc_config sm8150_cfg = { + .sct_data = sm8150_data, + .size = ARRAY_SIZE(sm8150_data), +}; + static struct llcc_drv_data *drv_data = (void *) -EPROBE_DEFER; /** @@ -529,6 +558,7 @@ err: static const struct of_device_id qcom_llcc_of_match[] = { { .compatible = "qcom,sc7180-llcc", .data = &sc7180_cfg }, { .compatible = "qcom,sdm845-llcc", .data = &sdm845_cfg }, + { .compatible = "qcom,sm8150-llcc", .data = &sm8150_cfg }, { } }; diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 90b864655822..3db6797ba6ff 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -16,6 +16,7 @@ #define LLCC_AUDIO 6 #define LLCC_MDMHPGRW 7 #define LLCC_MDM 8 +#define LLCC_MODHW 9 #define LLCC_CMPT 10 #define LLCC_GPUHTW 11 #define LLCC_GPU 12 @@ -26,6 +27,11 @@ #define LLCC_MDMHPFX 20 #define LLCC_MDMPNG 21 #define LLCC_AUDHW 22 +#define LLCC_NPU 23 +#define LLCC_WLHW 24 +#define LLCC_MODPE 29 +#define LLCC_APTCM 30 +#define LLCC_WRCACHE 31 /** * llcc_slice_desc - Cache slice descriptor -- cgit v1.2.3 From 607a4672b458b12674b96724e2f9bd42a5e928c6 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 20 Nov 2020 10:55:17 +0000 Subject: firmware: arm_scmi: Add full list of sensor type enumeration SCMI v2.0 provides a big list of 
sensor type enumeration from the sensorUnits enumeration table of Distributed Management Task Force(DMTF) specification number DSP 0248 (Platform Level Data Model for Platform Monitoring and Control Specification). It is however not an exact replica of the sensorUnits enumeration table. Let us just update the table as per SCMI v2.0 specification. Link: https://lore.kernel.org/r/20201119174906.43862-3-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 81 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 9cd312a1ff92..13d75956aa91 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -163,11 +163,92 @@ struct scmi_sensor_info { */ enum scmi_sensor_class { NONE = 0x0, + UNSPEC = 0x1, TEMPERATURE_C = 0x2, + TEMPERATURE_F = 0x3, + TEMPERATURE_K = 0x4, VOLTAGE = 0x5, CURRENT = 0x6, POWER = 0x7, ENERGY = 0x8, + CHARGE = 0x9, + VOLTAMPERE = 0xA, + NITS = 0xB, + LUMENS = 0xC, + LUX = 0xD, + CANDELAS = 0xE, + KPA = 0xF, + PSI = 0x10, + NEWTON = 0x11, + CFM = 0x12, + RPM = 0x13, + HERTZ = 0x14, + SECS = 0x15, + MINS = 0x16, + HOURS = 0x17, + DAYS = 0x18, + WEEKS = 0x19, + MILS = 0x1A, + INCHES = 0x1B, + FEET = 0x1C, + CUBIC_INCHES = 0x1D, + CUBIC_FEET = 0x1E, + METERS = 0x1F, + CUBIC_CM = 0x20, + CUBIC_METERS = 0x21, + LITERS = 0x22, + FLUID_OUNCES = 0x23, + RADIANS = 0x24, + STERADIANS = 0x25, + REVOLUTIONS = 0x26, + CYCLES = 0x27, + GRAVITIES = 0x28, + OUNCES = 0x29, + POUNDS = 0x2A, + FOOT_POUNDS = 0x2B, + OUNCE_INCHES = 0x2C, + GAUSS = 0x2D, + GILBERTS = 0x2E, + HENRIES = 0x2F, + FARADS = 0x30, + OHMS = 0x31, + SIEMENS = 0x32, + MOLES = 0x33, + BECQUERELS = 0x34, + PPM = 0x35, + DECIBELS = 0x36, + DBA = 0x37, + DBC = 0x38, + GRAYS = 0x39, + SIEVERTS = 0x3A, + COLOR_TEMP_K = 0x3B, + BITS = 0x3C, + BYTES = 0x3D, + WORDS = 0x3E, + DWORDS = 0x3F, + QWORDS = 0x40, + PERCENTAGE = 0x41, + PASCALS = 0x42, + COUNTS = 0x43, + GRAMS = 0x44, + NEWTON_METERS = 0x45, + HITS = 0x46, + MISSES = 0x47, + RETRIES = 0x48, + OVERRUNS = 0x49, + UNDERRUNS = 0x4A, + COLLISIONS = 0x4B, + PACKETS = 0x4C, + MESSAGES = 0x4D, + CHARS = 0x4E, + ERRORS = 0x4F, + CORRECTED_ERRS = 0x50, + UNCORRECTABLE_ERRS = 0x51, + SQ_MILS = 0x52, + SQ_INCHES = 0x53, + SQ_FEET = 0x54, + SQ_CM = 0x55, + SQ_METERS = 0x56, }; /** -- cgit v1.2.3 From 1fe00b8b4276ddf335216f884cb719edbea129e1 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Thu, 19 Nov 2020 17:49:02 +0000 Subject: firmware: arm_scmi: Add SCMI v3.0 sensors descriptors extensions Add support for new SCMI v3.0 Sensors extensions related to new sensors' features, like multiple axis and update intervals, while keeping compatibility with SCMI v2.0 features. While at that, refactor and simplify all the internal helpers macros and move struct scmi_sensor_info to use only non-fixed-size typing. 
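
The scale and resolution-exponent fields decoded below are 5-bit
two's-complement quantities, which is why the patch sign-extends them
before use. A self-contained sketch of the same decoding (field layout
copied from the hunks below; macro and helper names here are
illustrative, not the patch's own):

    #include <linux/bitfield.h>
    #include <linux/bits.h>
    #include <linux/types.h>

    #define EX_SENSOR_SCALE(x)      FIELD_GET(GENMASK(15, 11), (x))
    #define EX_SCALE_SIGN           BIT(4)
    #define EX_SCALE_EXTEND         GENMASK(31, 5)

    /* Sign-extend the 5-bit scale field of attributes_high to an s32. */
    static inline s32 example_sensor_scale(u32 attributes_high)
    {
            s32 scale = EX_SENSOR_SCALE(attributes_high);

            if (scale & EX_SCALE_SIGN)
                    scale |= EX_SCALE_EXTEND;
            return scale;
    }

For example, a raw field value of 0b11101 (29) sign-extends to -3,
meaning readings are reported in units of 10^-3 of the sensor type's
base unit.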
Link: https://lore.kernel.org/r/20201119174906.43862-3-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/sensors.c | 390 ++++++++++++++++++++++++++++++++++-- include/linux/scmi_protocol.h | 139 ++++++++++++- 2 files changed, 504 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c index 6aaff478d032..a85827f60a02 100644 --- a/drivers/firmware/arm_scmi/sensors.c +++ b/drivers/firmware/arm_scmi/sensors.c @@ -7,16 +7,22 @@ #define pr_fmt(fmt) "SCMI Notifications SENSOR - " fmt +#include #include #include "common.h" #include "notify.h" +#define SCMI_MAX_NUM_SENSOR_AXIS 63 +#define SCMIv2_SENSOR_PROTOCOL 0x10000 + enum scmi_sensor_protocol_cmd { SENSOR_DESCRIPTION_GET = 0x3, SENSOR_TRIP_POINT_NOTIFY = 0x4, SENSOR_TRIP_POINT_CONFIG = 0x5, SENSOR_READING_GET = 0x6, + SENSOR_AXIS_DESCRIPTION_GET = 0x7, + SENSOR_LIST_UPDATE_INTERVALS = 0x8, }; struct scmi_msg_resp_sensor_attributes { @@ -28,23 +34,100 @@ struct scmi_msg_resp_sensor_attributes { __le32 reg_size; }; +/* v3 attributes_low macros */ +#define SUPPORTS_UPDATE_NOTIFY(x) FIELD_GET(BIT(30), (x)) +#define SENSOR_TSTAMP_EXP(x) FIELD_GET(GENMASK(14, 10), (x)) +#define SUPPORTS_TIMESTAMP(x) FIELD_GET(BIT(9), (x)) +#define SUPPORTS_EXTEND_ATTRS(x) FIELD_GET(BIT(8), (x)) + +/* v2 attributes_high macros */ +#define SENSOR_UPDATE_BASE(x) FIELD_GET(GENMASK(31, 27), (x)) +#define SENSOR_UPDATE_SCALE(x) FIELD_GET(GENMASK(26, 22), (x)) + +/* v3 attributes_high macros */ +#define SENSOR_AXIS_NUMBER(x) FIELD_GET(GENMASK(21, 16), (x)) +#define SUPPORTS_AXIS(x) FIELD_GET(BIT(8), (x)) + +/* v3 resolution macros */ +#define SENSOR_RES(x) FIELD_GET(GENMASK(26, 0), (x)) +#define SENSOR_RES_EXP(x) FIELD_GET(GENMASK(31, 27), (x)) + +struct scmi_msg_resp_attrs { + __le32 min_range_low; + __le32 min_range_high; + __le32 max_range_low; + __le32 max_range_high; +}; + struct scmi_msg_resp_sensor_description { __le16 num_returned; __le16 num_remaining; - struct { + struct scmi_sensor_descriptor { + __le32 id; + __le32 attributes_low; +/* Common attributes_low macros */ +#define SUPPORTS_ASYNC_READ(x) FIELD_GET(BIT(31), (x)) +#define NUM_TRIP_POINTS(x) FIELD_GET(GENMASK(7, 0), (x)) + __le32 attributes_high; +/* Common attributes_high macros */ +#define SENSOR_SCALE(x) FIELD_GET(GENMASK(15, 11), (x)) +#define SENSOR_SCALE_SIGN BIT(4) +#define SENSOR_SCALE_EXTEND GENMASK(31, 5) +#define SENSOR_TYPE(x) FIELD_GET(GENMASK(7, 0), (x)) + u8 name[SCMI_MAX_STR_SIZE]; + /* only for version > 2.0 */ + __le32 power; + __le32 resolution; + struct scmi_msg_resp_attrs scalar_attrs; + } desc[]; +}; + +/* Base scmi_sensor_descriptor size excluding extended attrs after name */ +#define SCMI_MSG_RESP_SENS_DESCR_BASE_SZ 28 + +/* Sign extend to a full s32 */ +#define S32_EXT(v) \ + ({ \ + int __v = (v); \ + \ + if (__v & SENSOR_SCALE_SIGN) \ + __v |= SENSOR_SCALE_EXTEND; \ + __v; \ + }) + +struct scmi_msg_sensor_axis_description_get { + __le32 id; + __le32 axis_desc_index; +}; + +struct scmi_msg_resp_sensor_axis_description { + __le32 num_axis_flags; +#define NUM_AXIS_RETURNED(x) FIELD_GET(GENMASK(5, 0), (x)) +#define NUM_AXIS_REMAINING(x) FIELD_GET(GENMASK(31, 26), (x)) + struct scmi_axis_descriptor { __le32 id; __le32 attributes_low; -#define SUPPORTS_ASYNC_READ(x) ((x) & BIT(31)) -#define NUM_TRIP_POINTS(x) ((x) & 0xff) __le32 attributes_high; -#define SENSOR_TYPE(x) ((x) & 0xff) -#define SENSOR_SCALE(x) (((x) >> 11) & 0x1f) 
-#define SENSOR_SCALE_SIGN BIT(4) -#define SENSOR_SCALE_EXTEND GENMASK(7, 5) -#define SENSOR_UPDATE_SCALE(x) (((x) >> 22) & 0x1f) -#define SENSOR_UPDATE_BASE(x) (((x) >> 27) & 0x1f) - u8 name[SCMI_MAX_STR_SIZE]; - } desc[0]; + u8 name[SCMI_MAX_STR_SIZE]; + __le32 resolution; + struct scmi_msg_resp_attrs attrs; + } desc[]; +}; + +/* Base scmi_axis_descriptor size excluding extended attrs after name */ +#define SCMI_MSG_RESP_AXIS_DESCR_BASE_SZ 28 + +struct scmi_msg_sensor_list_update_intervals { + __le32 id; + __le32 index; +}; + +struct scmi_msg_resp_sensor_list_update_intervals { + __le32 num_intervals_flags; +#define NUM_INTERVALS_RETURNED(x) FIELD_GET(GENMASK(11, 0), (x)) +#define SEGMENTED_INTVL_FORMAT(x) FIELD_GET(BIT(12), (x)) +#define NUM_INTERVALS_REMAINING(x) FIELD_GET(GENMASK(31, 16), (x)) + __le32 intervals[]; }; struct scmi_msg_sensor_trip_point_notify { @@ -114,6 +197,194 @@ static int scmi_sensor_attributes_get(const struct scmi_handle *handle, return ret; } +static inline void scmi_parse_range_attrs(struct scmi_range_attrs *out, + struct scmi_msg_resp_attrs *in) +{ + out->min_range = get_unaligned_le64((void *)&in->min_range_low); + out->max_range = get_unaligned_le64((void *)&in->max_range_low); +} + +static int scmi_sensor_update_intervals(const struct scmi_handle *handle, + struct scmi_sensor_info *s) +{ + int ret, cnt; + u32 desc_index = 0; + u16 num_returned, num_remaining; + struct scmi_xfer *ti; + struct scmi_msg_resp_sensor_list_update_intervals *buf; + struct scmi_msg_sensor_list_update_intervals *msg; + + ret = scmi_xfer_get_init(handle, SENSOR_LIST_UPDATE_INTERVALS, + SCMI_PROTOCOL_SENSOR, sizeof(*msg), 0, &ti); + if (ret) + return ret; + + buf = ti->rx.buf; + do { + u32 flags; + + msg = ti->tx.buf; + /* Set the number of sensors to be skipped/already read */ + msg->id = cpu_to_le32(s->id); + msg->index = cpu_to_le32(desc_index); + + ret = scmi_do_xfer(handle, ti); + if (ret) + break; + + flags = le32_to_cpu(buf->num_intervals_flags); + num_returned = NUM_INTERVALS_RETURNED(flags); + num_remaining = NUM_INTERVALS_REMAINING(flags); + + /* + * Max intervals is not declared previously anywhere so we + * assume it's returned+remaining. + */ + if (!s->intervals.count) { + s->intervals.segmented = SEGMENTED_INTVL_FORMAT(flags); + s->intervals.count = num_returned + num_remaining; + /* segmented intervals are reported in one triplet */ + if (s->intervals.segmented && + (num_remaining || num_returned != 3)) { + dev_err(handle->dev, + "Sensor ID:%d advertises an invalid segmented interval (%d)\n", + s->id, s->intervals.count); + s->intervals.segmented = false; + s->intervals.count = 0; + ret = -EINVAL; + break; + } + /* Direct allocation when exceeding pre-allocated */ + if (s->intervals.count >= SCMI_MAX_PREALLOC_POOL) { + s->intervals.desc = + devm_kcalloc(handle->dev, + s->intervals.count, + sizeof(*s->intervals.desc), + GFP_KERNEL); + if (!s->intervals.desc) { + s->intervals.segmented = false; + s->intervals.count = 0; + ret = -ENOMEM; + break; + } + } + } else if (desc_index + num_returned > s->intervals.count) { + dev_err(handle->dev, + "No. 
of update intervals can't exceed %d\n", + s->intervals.count); + ret = -EINVAL; + break; + } + + for (cnt = 0; cnt < num_returned; cnt++) + s->intervals.desc[desc_index + cnt] = + le32_to_cpu(buf->intervals[cnt]); + + desc_index += num_returned; + + scmi_reset_rx_to_maxsz(handle, ti); + /* + * check for both returned and remaining to avoid infinite + * loop due to buggy firmware + */ + } while (num_returned && num_remaining); + + scmi_xfer_put(handle, ti); + return ret; +} + +static int scmi_sensor_axis_description(const struct scmi_handle *handle, + struct scmi_sensor_info *s) +{ + int ret, cnt; + u32 desc_index = 0; + u16 num_returned, num_remaining; + struct scmi_xfer *te; + struct scmi_msg_resp_sensor_axis_description *buf; + struct scmi_msg_sensor_axis_description_get *msg; + + s->axis = devm_kcalloc(handle->dev, s->num_axis, + sizeof(*s->axis), GFP_KERNEL); + if (!s->axis) + return -ENOMEM; + + ret = scmi_xfer_get_init(handle, SENSOR_AXIS_DESCRIPTION_GET, + SCMI_PROTOCOL_SENSOR, sizeof(*msg), 0, &te); + if (ret) + return ret; + + buf = te->rx.buf; + do { + u32 flags; + struct scmi_axis_descriptor *adesc; + + msg = te->tx.buf; + /* Set the number of sensors to be skipped/already read */ + msg->id = cpu_to_le32(s->id); + msg->axis_desc_index = cpu_to_le32(desc_index); + + ret = scmi_do_xfer(handle, te); + if (ret) + break; + + flags = le32_to_cpu(buf->num_axis_flags); + num_returned = NUM_AXIS_RETURNED(flags); + num_remaining = NUM_AXIS_REMAINING(flags); + + if (desc_index + num_returned > s->num_axis) { + dev_err(handle->dev, "No. of axis can't exceed %d\n", + s->num_axis); + break; + } + + adesc = &buf->desc[0]; + for (cnt = 0; cnt < num_returned; cnt++) { + u32 attrh, attrl; + struct scmi_sensor_axis_info *a; + size_t dsize = SCMI_MSG_RESP_AXIS_DESCR_BASE_SZ; + + attrl = le32_to_cpu(adesc->attributes_low); + + a = &s->axis[desc_index + cnt]; + + a->id = le32_to_cpu(adesc->id); + a->extended_attrs = SUPPORTS_EXTEND_ATTRS(attrl); + + attrh = le32_to_cpu(adesc->attributes_high); + a->scale = S32_EXT(SENSOR_SCALE(attrh)); + a->type = SENSOR_TYPE(attrh); + strlcpy(a->name, adesc->name, SCMI_MAX_STR_SIZE); + + if (a->extended_attrs) { + unsigned int ares = + le32_to_cpu(adesc->resolution); + + a->resolution = SENSOR_RES(ares); + a->exponent = + S32_EXT(SENSOR_RES_EXP(ares)); + dsize += sizeof(adesc->resolution); + + scmi_parse_range_attrs(&a->attrs, + &adesc->attrs); + dsize += sizeof(adesc->attrs); + } + + adesc = (typeof(adesc))((u8 *)adesc + dsize); + } + + desc_index += num_returned; + + scmi_reset_rx_to_maxsz(handle, te); + /* + * check for both returned and remaining to avoid infinite + * loop due to buggy firmware + */ + } while (num_returned && num_remaining); + + scmi_xfer_put(handle, te); + return ret; +} + static int scmi_sensor_description_get(const struct scmi_handle *handle, struct sensors_info *si) { @@ -131,9 +402,10 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle, buf = t->rx.buf; do { + struct scmi_sensor_descriptor *sdesc; + /* Set the number of sensors to be skipped/already read */ put_unaligned_le32(desc_index, t->tx.buf); - ret = scmi_do_xfer(handle, t); if (ret) break; @@ -147,22 +419,97 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle, break; } + sdesc = &buf->desc[0]; for (cnt = 0; cnt < num_returned; cnt++) { u32 attrh, attrl; struct scmi_sensor_info *s; + size_t dsize = SCMI_MSG_RESP_SENS_DESCR_BASE_SZ; - attrl = le32_to_cpu(buf->desc[cnt].attributes_low); - attrh = le32_to_cpu(buf->desc[cnt].attributes_high); 
s = &si->sensors[desc_index + cnt]; - s->id = le32_to_cpu(buf->desc[cnt].id); - s->type = SENSOR_TYPE(attrh); - s->scale = SENSOR_SCALE(attrh); - /* Sign extend to a full s8 */ - if (s->scale & SENSOR_SCALE_SIGN) - s->scale |= SENSOR_SCALE_EXTEND; + s->id = le32_to_cpu(sdesc->id); + + attrl = le32_to_cpu(sdesc->attributes_low); + /* common bitfields parsing */ s->async = SUPPORTS_ASYNC_READ(attrl); s->num_trip_points = NUM_TRIP_POINTS(attrl); - strlcpy(s->name, buf->desc[cnt].name, SCMI_MAX_STR_SIZE); + /** + * only SCMIv3.0 specific bitfield below. + * Such bitfields are assumed to be zeroed on non + * relevant fw versions...assuming fw not buggy ! + */ + s->update = SUPPORTS_UPDATE_NOTIFY(attrl); + s->timestamped = SUPPORTS_TIMESTAMP(attrl); + if (s->timestamped) + s->tstamp_scale = + S32_EXT(SENSOR_TSTAMP_EXP(attrl)); + s->extended_scalar_attrs = + SUPPORTS_EXTEND_ATTRS(attrl); + + attrh = le32_to_cpu(sdesc->attributes_high); + /* common bitfields parsing */ + s->scale = S32_EXT(SENSOR_SCALE(attrh)); + s->type = SENSOR_TYPE(attrh); + /* Use pre-allocated pool wherever possible */ + s->intervals.desc = s->intervals.prealloc_pool; + if (si->version == SCMIv2_SENSOR_PROTOCOL) { + s->intervals.segmented = false; + s->intervals.count = 1; + /* + * Convert SCMIv2.0 update interval format to + * SCMIv3.0 to be used as the common exposed + * descriptor, accessible via common macros. + */ + s->intervals.desc[0] = + (SENSOR_UPDATE_BASE(attrh) << 5) | + SENSOR_UPDATE_SCALE(attrh); + } else { + /* + * From SCMIv3.0 update intervals are retrieved + * via a dedicated (optional) command. + * Since the command is optional, on error carry + * on without any update interval. + */ + if (scmi_sensor_update_intervals(handle, s)) + dev_dbg(handle->dev, + "Update Intervals not available for sensor ID:%d\n", + s->id); + } + /** + * only > SCMIv2.0 specific bitfield below. + * Such bitfields are assumed to be zeroed on non + * relevant fw versions...assuming fw not buggy ! + */ + s->num_axis = min_t(unsigned int, + SUPPORTS_AXIS(attrh) ? + SENSOR_AXIS_NUMBER(attrh) : 0, + SCMI_MAX_NUM_SENSOR_AXIS); + strlcpy(s->name, sdesc->name, SCMI_MAX_STR_SIZE); + + if (s->extended_scalar_attrs) { + s->sensor_power = le32_to_cpu(sdesc->power); + dsize += sizeof(sdesc->power); + /* Only for sensors reporting scalar values */ + if (s->num_axis == 0) { + unsigned int sres = + le32_to_cpu(sdesc->resolution); + + s->resolution = SENSOR_RES(sres); + s->exponent = + S32_EXT(SENSOR_RES_EXP(sres)); + dsize += sizeof(sdesc->resolution); + + scmi_parse_range_attrs(&s->scalar_attrs, + &sdesc->scalar_attrs); + dsize += sizeof(sdesc->scalar_attrs); + } + } + if (s->num_axis > 0) { + ret = scmi_sensor_axis_description(handle, s); + if (ret) + goto out; + } + + sdesc = (typeof(sdesc))((u8 *)sdesc + dsize); } desc_index += num_returned; @@ -174,6 +521,7 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle, */ } while (num_returned && num_remaining); +out: scmi_xfer_put(handle, t); return ret; } diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 13d75956aa91..0792b0be25a3 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -8,6 +8,7 @@ #ifndef _LINUX_SCMI_PROTOCOL_H #define _LINUX_SCMI_PROTOCOL_H +#include #include #include #include @@ -148,13 +149,135 @@ struct scmi_power_ops { u32 *state); }; +/** + * scmi_range_attrs - specifies a sensor or axis values' range + * @min_range: The minimum value which can be represented by the sensor/axis. 
+ * @max_range: The maximum value which can be represented by the sensor/axis. + */ +struct scmi_range_attrs { + long long min_range; + long long max_range; +}; + +/** + * scmi_sensor_axis_info - describes one sensor axis + * @id: The axis ID. + * @type: Axis type. Chosen from @enum scmi_sensor_class. + * @scale: Power-of-10 multiplier applied to the axis unit. + * @name: NULL-terminated string representing the axis name as advertised by + * the SCMI platform. + * @extended_attrs: Flag to indicate the presence of additional extended + * attributes for this axis. + * @resolution: Extended attribute representing the resolution of the axis. + * Set to 0 if not reported by this axis. + * @exponent: Extended attribute representing the power-of-10 multiplier that + * is applied to the resolution field. Set to 0 if not reported by + * this axis. + * @attrs: Extended attributes representing the minimum and maximum values + * measurable by this axis. Set to 0 if not reported by this axis. + */ +struct scmi_sensor_axis_info { + unsigned int id; + unsigned int type; + int scale; + char name[SCMI_MAX_STR_SIZE]; + bool extended_attrs; + unsigned int resolution; + int exponent; + struct scmi_range_attrs attrs; +}; + +/** + * scmi_sensor_intervals_info - describes the number and type of available + * update intervals + * @segmented: Flag for segmented intervals' representation. When true there + * will be exactly 3 intervals in @desc, with each entry + * representing a member of a segment in this order: + * {lowest update interval, highest update interval, step size} + * @count: Number of intervals described in @desc. + * @desc: Array of @count interval descriptor bitmasks, represented as detailed + * in the SCMI specification: it can be accessed using the accompanying + * macros. + * @prealloc_pool: A minimal preallocated pool of desc entries used to avoid + * smaller-than-64-bytes dynamic allocations for small @count + * values. + */ +struct scmi_sensor_intervals_info { + bool segmented; + unsigned int count; +#define SCMI_SENS_INTVL_SEGMENT_LOW 0 +#define SCMI_SENS_INTVL_SEGMENT_HIGH 1 +#define SCMI_SENS_INTVL_SEGMENT_STEP 2 + unsigned int *desc; +#define SCMI_SENS_INTVL_GET_SECS(x) FIELD_GET(GENMASK(20, 5), (x)) +#define SCMI_SENS_INTVL_GET_EXP(x) \ + ({ \ + int __signed_exp = FIELD_GET(GENMASK(4, 0), (x)); \ + \ + if (__signed_exp & BIT(4)) \ + __signed_exp |= GENMASK(31, 5); \ + __signed_exp; \ + }) +#define SCMI_MAX_PREALLOC_POOL 16 + unsigned int prealloc_pool[SCMI_MAX_PREALLOC_POOL]; +}; + +/** + * struct scmi_sensor_info - represents information related to one of the + * available sensors. + * @id: Sensor ID. + * @type: Sensor type. Chosen from @enum scmi_sensor_class. + * @scale: Power-of-10 multiplier applied to the sensor unit. + * @num_trip_points: Maximum number of configurable trip points. + * @async: Flag for asynchronous read support. + * @update: Flag for continuous update notification support. + * @timestamped: Flag for timestamped read support. + * @tstamp_scale: Power-of-10 multiplier applied to the sensor timestamps to + * represent them in seconds. + * @num_axis: Number of supported axes, if any. Reported as 0 for scalar sensors. + * @axis: Pointer to an array of @num_axis descriptors. + * @intervals: Descriptor of available update intervals. + * @sensor_config: A bitmask reporting the current sensor configuration as + * detailed in the SCMI specification: it can be accessed and + * modified through the accompanying macros.
+ * @name: NULL-terminated string representing sensor name as advertised by + * SCMI platform. + * @extended_scalar_attrs: Flag to indicate the presence of additional extended + * attributes for this sensor. + * @sensor_power: Extended attribute representing the average power + * consumed by the sensor in microwatts (uW) when it is active. + * Reported here only for scalar sensors. + * Set to 0 if not reported by this sensor. + * @resolution: Extended attribute representing the resolution of the sensor. + * Reported here only for scalar sensors. + * Set to 0 if not reported by this sensor. + * @exponent: Extended attribute representing the power-of-10 multiplier that is + * applied to the resolution field. + * Reported here only for scalar sensors. + * Set to 0 if not reported by this sensor. + * @scalar_attrs: Extended attributes representing minimum and maximum + * measurable values by this sensor. + * Reported here only for scalar sensors. + * Set to 0 if not reported by this sensor. + */ struct scmi_sensor_info { - u32 id; - u8 type; - s8 scale; - u8 num_trip_points; + unsigned int id; + unsigned int type; + int scale; + unsigned int num_trip_points; bool async; + bool update; + bool timestamped; + int tstamp_scale; + unsigned int num_axis; + struct scmi_sensor_axis_info *axis; + struct scmi_sensor_intervals_info intervals; char name[SCMI_MAX_STR_SIZE]; + bool extended_scalar_attrs; + unsigned int sensor_power; + unsigned int resolution; + int exponent; + struct scmi_range_attrs scalar_attrs; }; /* @@ -249,6 +372,14 @@ enum scmi_sensor_class { SQ_FEET = 0x54, SQ_CM = 0x55, SQ_METERS = 0x56, + RADIANS_SEC = 0x57, + BPM = 0x58, + METERS_SEC_SQUARED = 0x59, + METERS_SEC = 0x5A, + CUBIC_METERS_SEC = 0x5B, + MM_MERCURY = 0x5C, + RADIANS_SEC_SQUARED = 0x5D, + OEM_UNIT = 0xFF }; /** -- cgit v1.2.3 From e2083d36739168f7b612312160cf7bb45b251408 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Thu, 19 Nov 2020 17:49:04 +0000 Subject: firmware: arm_scmi: Add SCMI v3.0 sensors timestamped reads Add new .reading_get_timestamped() method to sensor_ops to support SCMI v3.0 timestamped reads. 
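As an illustration (an editorial sketch, not part of this patch), a client could exercise the new operation roughly as follows. The helper name example_read_all_axes(), the fixed-size local buffer and the logging are invented for the example; only the sensor_ops calls reflect the interface added here:

#include <linux/kernel.h>
#include <linux/scmi_protocol.h>

/* Hypothetical helper, not from this patch: dump every axis of a sensor. */
static int example_read_all_axes(const struct scmi_handle *handle, u32 sensor_id)
{
	const struct scmi_sensor_info *s;
	struct scmi_sensor_reading readings[8];
	u8 count;
	int i, ret;

	s = handle->sensor_ops->info_get(handle, sensor_id);
	if (!s)
		return -ENODEV;

	/* Scalar sensors report one value, multi-axis ones up to num_axis. */
	count = min_t(unsigned int, s->num_axis ?: 1, ARRAY_SIZE(readings));

	ret = handle->sensor_ops->reading_get_timestamped(handle, sensor_id,
							  count, readings);
	if (ret)
		return ret;

	for (i = 0; i < count; i++)
		pr_info("sensor:%u axis:%d value:%lld ts:%llu\n", sensor_id, i,
			readings[i].value, readings[i].timestamp);

	return 0;
}

Note that, per the input checks in the implementation below, passing a count smaller than num_axis against a multi-axis sensor is also valid and returns only the leading axes.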
Link: https://lore.kernel.org/r/20201119174906.43862-5-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/sensors.c | 127 ++++++++++++++++++++++++++++++++++-- include/linux/scmi_protocol.h | 22 +++++++ 2 files changed, 143 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c index a85827f60a02..2239af5f9e6e 100644 --- a/drivers/firmware/arm_scmi/sensors.c +++ b/drivers/firmware/arm_scmi/sensors.c @@ -155,6 +155,23 @@ struct scmi_msg_sensor_reading_get { #define SENSOR_READ_ASYNC BIT(0) }; +struct scmi_resp_sensor_reading_complete { + __le32 id; + __le64 readings; +}; + +struct scmi_sensor_reading_le { + __le32 sensor_value_low; + __le32 sensor_value_high; + __le32 timestamp_low; + __le32 timestamp_high; +}; + +struct scmi_resp_sensor_reading_complete_v3 { + __le32 id; + struct scmi_sensor_reading_le readings[]; +}; + struct scmi_sensor_trip_notify_payld { __le32 agent_id; __le32 sensor_id; @@ -575,6 +592,21 @@ scmi_sensor_trip_point_config(const struct scmi_handle *handle, u32 sensor_id, return ret; } +/** + * scmi_sensor_reading_get - Read scalar sensor value + * @handle: Platform handle + * @sensor_id: Sensor ID + * @value: The 64-bit sensor reading value + * + * This function returns a single 64-bit reading value representing the sensor + * value; if the platform SCMI Protocol implementation and the sensor support + * multiple axes and timestamped reads, this just returns the first axis while + * dropping the timestamp value. + * Use @scmi_sensor_reading_get_timestamped instead to retrieve the array of + * timestamped multi-axis values. + * + * Return: 0 on Success + */ static int scmi_sensor_reading_get(const struct scmi_handle *handle, u32 sensor_id, u64 *value) { @@ -585,20 +617,24 @@ static int scmi_sensor_reading_get(const struct scmi_handle *handle, struct scmi_sensor_info *s = si->sensors + sensor_id; ret = scmi_xfer_get_init(handle, SENSOR_READING_GET, - SCMI_PROTOCOL_SENSOR, sizeof(*sensor), - sizeof(u64), &t); + SCMI_PROTOCOL_SENSOR, sizeof(*sensor), 0, &t); if (ret) return ret; sensor = t->tx.buf; sensor->id = cpu_to_le32(sensor_id); - if (s->async) { sensor->flags = cpu_to_le32(SENSOR_READ_ASYNC); ret = scmi_do_xfer_with_response(handle, t); - if (!ret) - *value = get_unaligned_le64((void *) - ((__le32 *)t->rx.buf + 1)); + if (!ret) { + struct scmi_resp_sensor_reading_complete *resp; + + resp = t->rx.buf; + if (le32_to_cpu(resp->id) == sensor_id) + *value = get_unaligned_le64(&resp->readings); + else + ret = -EPROTO; + } } else { sensor->flags = cpu_to_le32(0); ret = scmi_do_xfer(handle, t); @@ -610,6 +646,84 @@ static int scmi_sensor_reading_get(const struct scmi_handle *handle, return ret; } +static inline void +scmi_parse_sensor_readings(struct scmi_sensor_reading *out, + const struct scmi_sensor_reading_le *in) +{ + out->value = get_unaligned_le64((void *)&in->sensor_value_low); + out->timestamp = get_unaligned_le64((void *)&in->timestamp_low); +} + +/** + * scmi_sensor_reading_get_timestamped - Read multiple-axis timestamped values + * @handle: Platform handle + * @sensor_id: Sensor ID + * @count: The length of the provided @readings array + * @readings: An array of elements each representing a timestamped per-axis + * reading of type @struct scmi_sensor_reading.
+ * Returned readings are ordered as the @axis descriptors array + * included in @struct scmi_sensor_info and the max number of + * returned elements is min(@count, @num_axis); ideally the provided + * array should be of length @count equal to @num_axis. + * + * Return: 0 on Success + */ +static int +scmi_sensor_reading_get_timestamped(const struct scmi_handle *handle, + u32 sensor_id, u8 count, + struct scmi_sensor_reading *readings) +{ + int ret; + struct scmi_xfer *t; + struct scmi_msg_sensor_reading_get *sensor; + struct sensors_info *si = handle->sensor_priv; + struct scmi_sensor_info *s = si->sensors + sensor_id; + + if (!count || !readings || + (!s->num_axis && count > 1) || (s->num_axis && count > s->num_axis)) + return -EINVAL; + + ret = scmi_xfer_get_init(handle, SENSOR_READING_GET, + SCMI_PROTOCOL_SENSOR, sizeof(*sensor), 0, &t); + if (ret) + return ret; + + sensor = t->tx.buf; + sensor->id = cpu_to_le32(sensor_id); + if (s->async) { + sensor->flags = cpu_to_le32(SENSOR_READ_ASYNC); + ret = scmi_do_xfer_with_response(handle, t); + if (!ret) { + int i; + struct scmi_resp_sensor_reading_complete_v3 *resp; + + resp = t->rx.buf; + /* Retrieve only the number of requested axis anyway */ + if (le32_to_cpu(resp->id) == sensor_id) + for (i = 0; i < count; i++) + scmi_parse_sensor_readings(&readings[i], + &resp->readings[i]); + else + ret = -EPROTO; + } + } else { + sensor->flags = cpu_to_le32(0); + ret = scmi_do_xfer(handle, t); + if (!ret) { + int i; + struct scmi_sensor_reading_le *resp_readings; + + resp_readings = t->rx.buf; + for (i = 0; i < count; i++) + scmi_parse_sensor_readings(&readings[i], + &resp_readings[i]); + } + } + + scmi_xfer_put(handle, t); + return ret; +} + static const struct scmi_sensor_info * scmi_sensor_info_get(const struct scmi_handle *handle, u32 sensor_id) { @@ -630,6 +744,7 @@ static const struct scmi_sensor_ops sensor_ops = { .info_get = scmi_sensor_info_get, .trip_point_config = scmi_sensor_trip_point_config, .reading_get = scmi_sensor_reading_get, + .reading_get_timestamped = scmi_sensor_reading_get_timestamped, }; static int scmi_sensor_set_notify_enabled(const struct scmi_handle *handle, diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 0792b0be25a3..0c52bf0cbee4 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -149,6 +149,20 @@ struct scmi_power_ops { u32 *state); }; +/** + * scmi_sensor_reading - represent a timestamped read + * + * Used by @reading_get_timestamped method. + * + * @value: The signed value sensor read. + * @timestamp: An unsigned timestamp for the sensor read, as provided by + * SCMI platform. Set to zero when not available. + */ +struct scmi_sensor_reading { + long long value; + unsigned long long timestamp; +}; + /** * scmi_range_attrs - specifies a sensor or axis values' range * @min_range: The minimum value which can be represented by the sensor/axis. @@ -390,6 +404,11 @@ enum scmi_sensor_class { * @info_get: get the information of the specified sensor * @trip_point_config: selects and configures a trip-point of interest * @reading_get: gets the current value of the sensor + * @reading_get_timestamped: gets the current value and timestamp, when + * available, of the sensor. 
(as of v3.0 spec) + * Supports multi-axis sensors, for sensors which + * support it, when the @readings array size of + * @count entries equals the sensor num_axis + */ struct scmi_sensor_ops { int (*count_get)(const struct scmi_handle *handle); @@ -399,6 +418,9 @@ struct scmi_sensor_ops { u32 sensor_id, u8 trip_id, u64 trip_value); int (*reading_get)(const struct scmi_handle *handle, u32 sensor_id, u64 *value); + int (*reading_get_timestamped)(const struct scmi_handle *handle, + u32 sensor_id, u8 count, + struct scmi_sensor_reading *readings); }; /** -- cgit v1.2.3 From 7b83c5f41088987d04e24c3af0e1fb9f43b747b5 Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Thu, 19 Nov 2020 17:49:05 +0000 Subject: firmware: arm_scmi: Add SCMI v3.0 sensor configuration support Add SCMI v3.0 sensor support for CONFIG_GET/CONFIG_SET commands. Link: https://lore.kernel.org/r/20201119174906.43862-6-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/sensors.c | 63 +++++++++++++++++++++++++++++++++++++ include/linux/scmi_protocol.h | 37 ++++++++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c index 2239af5f9e6e..10c271d430e7 100644 --- a/drivers/firmware/arm_scmi/sensors.c +++ b/drivers/firmware/arm_scmi/sensors.c @@ -23,6 +23,8 @@ enum scmi_sensor_protocol_cmd { SENSOR_READING_GET = 0x6, SENSOR_AXIS_DESCRIPTION_GET = 0x7, SENSOR_LIST_UPDATE_INTERVALS = 0x8, + SENSOR_CONFIG_GET = 0x9, + SENSOR_CONFIG_SET = 0xA, }; struct scmi_msg_resp_sensor_attributes { @@ -149,6 +151,11 @@ struct scmi_msg_set_sensor_trip_point { __le32 value_high; }; +struct scmi_msg_sensor_config_set { + __le32 id; + __le32 sensor_config; +}; + struct scmi_msg_sensor_reading_get { __le32 id; __le32 flags; @@ -592,6 +599,60 @@ scmi_sensor_trip_point_config(const struct scmi_handle *handle, u32 sensor_id, return ret; } +static int scmi_sensor_config_get(const struct scmi_handle *handle, + u32 sensor_id, u32 *sensor_config) +{ + int ret; + struct scmi_xfer *t; + + ret = scmi_xfer_get_init(handle, SENSOR_CONFIG_GET, + SCMI_PROTOCOL_SENSOR, sizeof(__le32), + sizeof(__le32), &t); + if (ret) + return ret; + + put_unaligned_le32(sensor_id, t->tx.buf); + ret = scmi_do_xfer(handle, t); + if (!ret) { + struct sensors_info *si = handle->sensor_priv; + struct scmi_sensor_info *s = si->sensors + sensor_id; + + *sensor_config = get_unaligned_le32(t->rx.buf); + s->sensor_config = *sensor_config; + } + + scmi_xfer_put(handle, t); + return ret; +} + +static int scmi_sensor_config_set(const struct scmi_handle *handle, + u32 sensor_id, u32 sensor_config) +{ + int ret; + struct scmi_xfer *t; + struct scmi_msg_sensor_config_set *msg; + + ret = scmi_xfer_get_init(handle, SENSOR_CONFIG_SET, + SCMI_PROTOCOL_SENSOR, sizeof(*msg), 0, &t); + if (ret) + return ret; + + msg = t->tx.buf; + msg->id = cpu_to_le32(sensor_id); + msg->sensor_config = cpu_to_le32(sensor_config); + + ret = scmi_do_xfer(handle, t); + if (!ret) { + struct sensors_info *si = handle->sensor_priv; + struct scmi_sensor_info *s = si->sensors + sensor_id; + + s->sensor_config = sensor_config; + } + + scmi_xfer_put(handle, t); + return ret; +} + /** * scmi_sensor_reading_get - Read scalar sensor value * @handle: Platform handle @@ -745,6 +806,8 @@ static const struct scmi_sensor_ops sensor_ops = { .trip_point_config = scmi_sensor_trip_point_config, .reading_get = scmi_sensor_reading_get, .reading_get_timestamped
= scmi_sensor_reading_get_timestamped, + .config_get = scmi_sensor_config_get, + .config_set = scmi_sensor_config_set, }; static int scmi_sensor_set_notify_enabled(const struct scmi_handle *handle, diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 0c52bf0cbee4..7e9e2cd3d46b 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -286,7 +286,38 @@ struct scmi_sensor_info { unsigned int num_axis; struct scmi_sensor_axis_info *axis; struct scmi_sensor_intervals_info intervals; + unsigned int sensor_config; +#define SCMI_SENS_CFG_UPDATE_SECS_MASK GENMASK(31, 16) +#define SCMI_SENS_CFG_GET_UPDATE_SECS(x) \ + FIELD_GET(SCMI_SENS_CFG_UPDATE_SECS_MASK, (x)) + +#define SCMI_SENS_CFG_UPDATE_EXP_MASK GENMASK(15, 11) +#define SCMI_SENS_CFG_GET_UPDATE_EXP(x) \ + ({ \ + int __signed_exp = \ + FIELD_GET(SCMI_SENS_CFG_UPDATE_EXP_MASK, (x)); \ + \ + if (__signed_exp & BIT(4)) \ + __signed_exp |= GENMASK(31, 5); \ + __signed_exp; \ + }) + +#define SCMI_SENS_CFG_ROUND_MASK GENMASK(10, 9) +#define SCMI_SENS_CFG_ROUND_AUTO 2 +#define SCMI_SENS_CFG_ROUND_UP 1 +#define SCMI_SENS_CFG_ROUND_DOWN 0 + +#define SCMI_SENS_CFG_TSTAMP_ENABLED_MASK BIT(1) +#define SCMI_SENS_CFG_TSTAMP_ENABLE 1 +#define SCMI_SENS_CFG_TSTAMP_DISABLE 0 +#define SCMI_SENS_CFG_IS_TSTAMP_ENABLED(x) \ + FIELD_GET(SCMI_SENS_CFG_TSTAMP_ENABLED_MASK, (x)) + +#define SCMI_SENS_CFG_SENSOR_ENABLED_MASK BIT(0) +#define SCMI_SENS_CFG_SENSOR_ENABLE 1 +#define SCMI_SENS_CFG_SENSOR_DISABLE 0 char name[SCMI_MAX_STR_SIZE]; +#define SCMI_SENS_CFG_IS_ENABLED(x) FIELD_GET(BIT(0), (x)) bool extended_scalar_attrs; unsigned int sensor_power; unsigned int resolution; @@ -409,6 +440,8 @@ enum scmi_sensor_class { * Supports multi-axis sensors for sensors which * supports it and if the @reading array size of * @count entry equals the sensor num_axis + * @config_get: Get sensor current configuration + * @config_set: Set sensor current configuration */ struct scmi_sensor_ops { int (*count_get)(const struct scmi_handle *handle); @@ -421,6 +454,10 @@ struct scmi_sensor_ops { int (*reading_get_timestamped)(const struct scmi_handle *handle, u32 sensor_id, u8 count, struct scmi_sensor_reading *readings); + int (*config_get)(const struct scmi_handle *handle, + u32 sensor_id, u32 *sensor_config); + int (*config_set)(const struct scmi_handle *handle, + u32 sensor_id, u32 sensor_config); }; /** -- cgit v1.2.3 From e3811190acf85c63518fbddaa28bcbfab2baa58d Mon Sep 17 00:00:00 2001 From: Cristian Marussi Date: Thu, 19 Nov 2020 17:49:06 +0000 Subject: firmware: arm_scmi: Add SCMI v3.0 sensor notifications Add support for new SCMI v3.0 SENSOR_UPDATE notification. 
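For the consumer side, here is a sketch of how a driver might subscribe to the new event through the existing notification interface (editorial illustration, not part of this patch; the callback, notifier block and registration helper are invented, and the notify_ops member of scmi_handle is assumed as provided by the notification core):

#include <linux/notifier.h>
#include <linux/printk.h>
#include <linux/scmi_protocol.h>

/* Hypothetical notifier: data points to a struct scmi_sensor_update_report. */
static int example_sensor_update_cb(struct notifier_block *nb,
				    unsigned long event, void *data)
{
	const struct scmi_sensor_update_report *r = data;
	unsigned int i;

	for (i = 0; i < r->readings_count; i++)
		pr_info("sensor:%u reading[%u]: %lld (ts:%llu)\n",
			r->sensor_id, i, r->readings[i].value,
			r->readings[i].timestamp);

	return NOTIFY_OK;
}

static struct notifier_block example_sensor_nb = {
	.notifier_call = example_sensor_update_cb,
};

/* Hypothetical registration, e.g. from probe(): watch updates of sensor 0. */
static int example_subscribe(const struct scmi_handle *handle)
{
	u32 src_id = 0;

	return handle->notify_ops->register_event_notifier(handle,
					SCMI_PROTOCOL_SENSOR,
					SCMI_EVENT_SENSOR_UPDATE,
					&src_id, &example_sensor_nb);
}

Depending on the platform, the sensor may also need to be enabled and given an update interval (see the SENSOR_CONFIG_SET support added earlier in this series) before updates are emitted.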
Link: https://lore.kernel.org/r/20201119174906.43862-7-cristian.marussi@arm.com Signed-off-by: Cristian Marussi Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/sensors.c | 124 ++++++++++++++++++++++++++++++------ include/linux/scmi_protocol.h | 9 +++ 2 files changed, 114 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c index 10c271d430e7..b3d7c08c09a0 100644 --- a/drivers/firmware/arm_scmi/sensors.c +++ b/drivers/firmware/arm_scmi/sensors.c @@ -25,6 +25,7 @@ enum scmi_sensor_protocol_cmd { SENSOR_LIST_UPDATE_INTERVALS = 0x8, SENSOR_CONFIG_GET = 0x9, SENSOR_CONFIG_SET = 0xA, + SENSOR_CONTINUOUS_UPDATE_NOTIFY = 0xB, }; struct scmi_msg_resp_sensor_attributes { @@ -132,10 +133,10 @@ struct scmi_msg_resp_sensor_list_update_intervals { __le32 intervals[]; }; -struct scmi_msg_sensor_trip_point_notify { +struct scmi_msg_sensor_request_notify { __le32 id; __le32 event_control; -#define SENSOR_TP_NOTIFY_ALL BIT(0) +#define SENSOR_NOTIFY_ALL BIT(0) }; struct scmi_msg_set_sensor_trip_point { @@ -185,6 +186,12 @@ struct scmi_sensor_trip_notify_payld { __le32 trip_point_desc; }; +struct scmi_sensor_update_notify_payld { + __le32 agent_id; + __le32 sensor_id; + struct scmi_sensor_reading_le readings[]; +}; + struct sensors_info { u32 version; int num_sensors; @@ -550,15 +557,16 @@ out: return ret; } -static int scmi_sensor_trip_point_notify(const struct scmi_handle *handle, - u32 sensor_id, bool enable) +static inline int +scmi_sensor_request_notify(const struct scmi_handle *handle, u32 sensor_id, + u8 message_id, bool enable) { int ret; - u32 evt_cntl = enable ? SENSOR_TP_NOTIFY_ALL : 0; + u32 evt_cntl = enable ? SENSOR_NOTIFY_ALL : 0; struct scmi_xfer *t; - struct scmi_msg_sensor_trip_point_notify *cfg; + struct scmi_msg_sensor_request_notify *cfg; - ret = scmi_xfer_get_init(handle, SENSOR_TRIP_POINT_NOTIFY, + ret = scmi_xfer_get_init(handle, message_id, SCMI_PROTOCOL_SENSOR, sizeof(*cfg), 0, &t); if (ret) return ret; @@ -573,6 +581,23 @@ static int scmi_sensor_trip_point_notify(const struct scmi_handle *handle, return ret; } +static int scmi_sensor_trip_point_notify(const struct scmi_handle *handle, + u32 sensor_id, bool enable) +{ + return scmi_sensor_request_notify(handle, sensor_id, + SENSOR_TRIP_POINT_NOTIFY, + enable); +} + +static int +scmi_sensor_continuous_update_notify(const struct scmi_handle *handle, + u32 sensor_id, bool enable) +{ + return scmi_sensor_request_notify(handle, sensor_id, + SENSOR_CONTINUOUS_UPDATE_NOTIFY, + enable); +} + static int scmi_sensor_trip_point_config(const struct scmi_handle *handle, u32 sensor_id, u8 trip_id, u64 trip_value) @@ -815,7 +840,19 @@ static int scmi_sensor_set_notify_enabled(const struct scmi_handle *handle, { int ret; - ret = scmi_sensor_trip_point_notify(handle, src_id, enable); + switch (evt_id) { + case SCMI_EVENT_SENSOR_TRIP_POINT_EVENT: + ret = scmi_sensor_trip_point_notify(handle, src_id, enable); + break; + case SCMI_EVENT_SENSOR_UPDATE: + ret = scmi_sensor_continuous_update_notify(handle, src_id, + enable); + break; + default: + ret = -EINVAL; + break; + } + if (ret) pr_debug("FAIL_ENABLED - evt[%X] dom[%d] - ret:%d\n", evt_id, src_id, ret); @@ -828,20 +865,59 @@ static void *scmi_sensor_fill_custom_report(const struct scmi_handle *handle, const void *payld, size_t payld_sz, void *report, u32 *src_id) { - const struct scmi_sensor_trip_notify_payld *p = payld; - struct scmi_sensor_trip_point_report *r = report; + void *rep = NULL; - if 
(evt_id != SCMI_EVENT_SENSOR_TRIP_POINT_EVENT || - sizeof(*p) != payld_sz) - return NULL; + switch (evt_id) { + case SCMI_EVENT_SENSOR_TRIP_POINT_EVENT: + { + const struct scmi_sensor_trip_notify_payld *p = payld; + struct scmi_sensor_trip_point_report *r = report; - r->timestamp = timestamp; - r->agent_id = le32_to_cpu(p->agent_id); - r->sensor_id = le32_to_cpu(p->sensor_id); - r->trip_point_desc = le32_to_cpu(p->trip_point_desc); - *src_id = r->sensor_id; + if (sizeof(*p) != payld_sz) + break; - return r; + r->timestamp = timestamp; + r->agent_id = le32_to_cpu(p->agent_id); + r->sensor_id = le32_to_cpu(p->sensor_id); + r->trip_point_desc = le32_to_cpu(p->trip_point_desc); + *src_id = r->sensor_id; + rep = r; + break; + } + case SCMI_EVENT_SENSOR_UPDATE: + { + int i; + struct scmi_sensor_info *s; + const struct scmi_sensor_update_notify_payld *p = payld; + struct scmi_sensor_update_report *r = report; + struct sensors_info *sinfo = handle->sensor_priv; + + /* payld_sz is variable for this event */ + r->sensor_id = le32_to_cpu(p->sensor_id); + if (r->sensor_id >= sinfo->num_sensors) + break; + r->timestamp = timestamp; + r->agent_id = le32_to_cpu(p->agent_id); + s = &sinfo->sensors[r->sensor_id]; + /* + * The generated report r (@struct scmi_sensor_update_report) + * was pre-allocated to contain up to SCMI_MAX_NUM_SENSOR_AXIS + * readings: here it is filled with the effective @num_axis + * readings defined for this sensor or 1 for scalar sensors. + */ + r->readings_count = s->num_axis ?: 1; + for (i = 0; i < r->readings_count; i++) + scmi_parse_sensor_readings(&r->readings[i], + &p->readings[i]); + *src_id = r->sensor_id; + rep = r; + break; + } + default: + break; + } + + return rep; } static const struct scmi_event sensor_events[] = { @@ -850,6 +926,16 @@ static const struct scmi_event sensor_events[] = { .max_payld_sz = sizeof(struct scmi_sensor_trip_notify_payld), .max_report_sz = sizeof(struct scmi_sensor_trip_point_report), }, + { + .id = SCMI_EVENT_SENSOR_UPDATE, + .max_payld_sz = + sizeof(struct scmi_sensor_update_notify_payld) + + SCMI_MAX_NUM_SENSOR_AXIS * + sizeof(struct scmi_sensor_reading_le), + .max_report_sz = sizeof(struct scmi_sensor_update_report) + + SCMI_MAX_NUM_SENSOR_AXIS * + sizeof(struct scmi_sensor_reading), + }, }; static const struct scmi_event_ops sensor_event_ops = { diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 7e9e2cd3d46b..be0be5ff7514 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -657,6 +657,7 @@ enum scmi_notification_events { SCMI_EVENT_PERFORMANCE_LIMITS_CHANGED = 0x0, SCMI_EVENT_PERFORMANCE_LEVEL_CHANGED = 0x1, SCMI_EVENT_SENSOR_TRIP_POINT_EVENT = 0x0, + SCMI_EVENT_SENSOR_UPDATE = 0x1, SCMI_EVENT_RESET_ISSUED = 0x0, SCMI_EVENT_BASE_ERROR_EVENT = 0x0, SCMI_EVENT_SYSTEM_POWER_STATE_NOTIFIER = 0x0, @@ -698,6 +699,14 @@ struct scmi_sensor_trip_point_report { unsigned int trip_point_desc; }; +struct scmi_sensor_update_report { + ktime_t timestamp; + unsigned int agent_id; + unsigned int sensor_id; + unsigned int readings_count; + struct scmi_sensor_reading readings[]; +}; + struct scmi_reset_issued_report { ktime_t timestamp; unsigned int agent_id; -- cgit v1.2.3 From e44cdff05145b84293e3f424daa17e4f3ce0109c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 19 Nov 2020 17:45:09 +0100 Subject: clk: samsung: Allow compile testing of Exynos, S3C64xx and S5Pv210 So far all Exynos, S3C64xx and S5Pv210 clock units were selected by respective SOC/ARCH Kconfig option. 
On a kernel built for selected SoCs, this allowed building only a limited set of matching clock drivers. However, compile testing was not possible in such a case, because the Makefile objects depend on the SOC/ARCH options. Add separate Kconfig options for each of them so that they can be compile tested. Link: https://lore.kernel.org/r/20201119164509.754851-1-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski Acked-by: Chanwoo Choi Signed-off-by: Sylwester Nawrocki --- drivers/clk/samsung/Kconfig | 67 ++++++++++++++++++++++++++++++++++++++++++-- drivers/clk/samsung/Makefile | 22 +++++++-------- include/linux/clk/samsung.h | 4 +-- 3 files changed, 78 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/samsung/Kconfig b/drivers/clk/samsung/Kconfig index 57d4b3f20417..9323fcfac6cc 100644 --- a/drivers/clk/samsung/Kconfig +++ b/drivers/clk/samsung/Kconfig @@ -2,10 +2,73 @@ # Recent Exynos platforms should just select COMMON_CLK_SAMSUNG: config COMMON_CLK_SAMSUNG bool "Samsung Exynos clock controller support" if COMPILE_TEST - # Clocks on ARM64 SoCs (e.g. Exynos5433, Exynos7) are chosen by - # EXYNOS_ARM64_COMMON_CLK to avoid building them on ARMv7: + select S3C64XX_COMMON_CLK if ARM && ARCH_S3C64XX + select S5PV210_COMMON_CLK if ARM && ARCH_S5PV210 + select EXYNOS_3250_COMMON_CLK if ARM && SOC_EXYNOS3250 + select EXYNOS_4_COMMON_CLK if ARM && ARCH_EXYNOS4 + select EXYNOS_5250_COMMON_CLK if ARM && SOC_EXYNOS5250 + select EXYNOS_5260_COMMON_CLK if ARM && SOC_EXYNOS5260 + select EXYNOS_5410_COMMON_CLK if ARM && SOC_EXYNOS5410 + select EXYNOS_5420_COMMON_CLK if ARM && SOC_EXYNOS5420 select EXYNOS_ARM64_COMMON_CLK if ARM64 && ARCH_EXYNOS +config S3C64XX_COMMON_CLK + bool "Samsung S3C64xx clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung S3C64xx SoCs. + Choose Y here only if you build for this SoC. + +config S5PV210_COMMON_CLK + bool "Samsung S5Pv210 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung S5Pv210 SoCs. + Choose Y here only if you build for this SoC. + +config EXYNOS_3250_COMMON_CLK + bool "Samsung Exynos3250 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos3250 SoCs. Choose Y here only if you build for this SoC. + +config EXYNOS_4_COMMON_CLK + bool "Samsung Exynos4 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos4212 and Exynos4412 SoCs. Choose Y here only if you build for + this SoC. + +config EXYNOS_5250_COMMON_CLK + bool "Samsung Exynos5250 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos5250 SoCs. Choose Y here only if you build for this SoC. + +config EXYNOS_5260_COMMON_CLK + bool "Samsung Exynos5260 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos5260 SoCs. Choose Y here only if you build for this SoC. + +config EXYNOS_5410_COMMON_CLK + bool "Samsung Exynos5410 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos5410 SoCs. Choose Y here only if you build for this SoC.
+ +config EXYNOS_5420_COMMON_CLK + bool "Samsung Exynos5420 clock controller support" if COMPILE_TEST + depends on COMMON_CLK_SAMSUNG + help + Support for the clock controller present on the Samsung + Exynos5420 SoCs. Choose Y here only if you build for this SoC. + config EXYNOS_ARM64_COMMON_CLK bool "Samsung Exynos ARMv8-family clock controller support" if COMPILE_TEST depends on COMMON_CLK_SAMSUNG diff --git a/drivers/clk/samsung/Makefile b/drivers/clk/samsung/Makefile index 1a4e6b787978..bb1433f11c88 100644 --- a/drivers/clk/samsung/Makefile +++ b/drivers/clk/samsung/Makefile @@ -4,15 +4,15 @@ # obj-$(CONFIG_COMMON_CLK) += clk.o clk-pll.o clk-cpu.o -obj-$(CONFIG_SOC_EXYNOS3250) += clk-exynos3250.o -obj-$(CONFIG_ARCH_EXYNOS4) += clk-exynos4.o -obj-$(CONFIG_ARCH_EXYNOS4) += clk-exynos4412-isp.o -obj-$(CONFIG_SOC_EXYNOS5250) += clk-exynos5250.o -obj-$(CONFIG_SOC_EXYNOS5250) += clk-exynos5-subcmu.o -obj-$(CONFIG_SOC_EXYNOS5260) += clk-exynos5260.o -obj-$(CONFIG_SOC_EXYNOS5410) += clk-exynos5410.o -obj-$(CONFIG_SOC_EXYNOS5420) += clk-exynos5420.o -obj-$(CONFIG_SOC_EXYNOS5420) += clk-exynos5-subcmu.o +obj-$(CONFIG_EXYNOS_3250_COMMON_CLK) += clk-exynos3250.o +obj-$(CONFIG_EXYNOS_4_COMMON_CLK) += clk-exynos4.o +obj-$(CONFIG_EXYNOS_4_COMMON_CLK) += clk-exynos4412-isp.o +obj-$(CONFIG_EXYNOS_5250_COMMON_CLK) += clk-exynos5250.o +obj-$(CONFIG_EXYNOS_5250_COMMON_CLK) += clk-exynos5-subcmu.o +obj-$(CONFIG_EXYNOS_5260_COMMON_CLK) += clk-exynos5260.o +obj-$(CONFIG_EXYNOS_5410_COMMON_CLK) += clk-exynos5410.o +obj-$(CONFIG_EXYNOS_5420_COMMON_CLK) += clk-exynos5420.o +obj-$(CONFIG_EXYNOS_5420_COMMON_CLK) += clk-exynos5-subcmu.o obj-$(CONFIG_EXYNOS_ARM64_COMMON_CLK) += clk-exynos5433.o obj-$(CONFIG_EXYNOS_AUDSS_CLK_CON) += clk-exynos-audss.o obj-$(CONFIG_ARCH_EXYNOS) += clk-exynos-clkout.o @@ -21,5 +21,5 @@ obj-$(CONFIG_S3C2410_COMMON_CLK)+= clk-s3c2410.o obj-$(CONFIG_S3C2410_COMMON_DCLK)+= clk-s3c2410-dclk.o obj-$(CONFIG_S3C2412_COMMON_CLK)+= clk-s3c2412.o obj-$(CONFIG_S3C2443_COMMON_CLK)+= clk-s3c2443.o -obj-$(CONFIG_ARCH_S3C64XX) += clk-s3c64xx.o -obj-$(CONFIG_ARCH_S5PV210) += clk-s5pv210.o clk-s5pv210-audss.o +obj-$(CONFIG_S3C64XX_COMMON_CLK) += clk-s3c64xx.o +obj-$(CONFIG_S5PV210_COMMON_CLK) += clk-s5pv210.o clk-s5pv210-audss.o diff --git a/include/linux/clk/samsung.h b/include/linux/clk/samsung.h index 79097e365f7f..38b774001712 100644 --- a/include/linux/clk/samsung.h +++ b/include/linux/clk/samsung.h @@ -10,7 +10,7 @@ struct device_node; -#ifdef CONFIG_ARCH_S3C64XX +#ifdef CONFIG_S3C64XX_COMMON_CLK void s3c64xx_clk_init(struct device_node *np, unsigned long xtal_f, unsigned long xusbxti_f, bool s3c6400, void __iomem *base); @@ -19,7 +19,7 @@ static inline void s3c64xx_clk_init(struct device_node *np, unsigned long xtal_f, unsigned long xusbxti_f, bool s3c6400, void __iomem *base) { } -#endif /* CONFIG_ARCH_S3C64XX */ +#endif /* CONFIG_S3C64XX_COMMON_CLK */ #ifdef CONFIG_S3C2410_COMMON_CLK void s3c2410_common_clk_init(struct device_node *np, unsigned long xti_f, -- cgit v1.2.3 From e7bbb7acabf47d74672e0e314bed4d452d2097b4 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Mon, 9 Nov 2020 14:24:49 +0530 Subject: dmaengine: add peripheral configuration Some complex dmaengine controllers have the capability to program the peripheral device, so pass the peripheral configuration on as part of dma_slave_config. Link: https://lore.kernel.org/r/20201109085450.24843-3-vkoul@kernel.org Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff
--git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index dd357a747780..493a047ed0a2 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -418,6 +418,9 @@ enum dma_slave_buswidth { * @slave_id: Slave requester id. Only valid for slave channels. The dma * slave peripheral will have unique id as dma requester which need to be * pass as slave config. + * @peripheral_config: peripheral configuration for programming the peripheral + * for a dmaengine transfer + * @peripheral_size: peripheral configuration buffer size * * This struct is passed in as configuration data to a DMA engine * in order to set up a certain channel for DMA transport at runtime. @@ -443,6 +446,8 @@ struct dma_slave_config { u32 dst_port_window_size; bool device_fc; unsigned int slave_id; + void *peripheral_config; + size_t peripheral_size; }; /** -- cgit v1.2.3 From 5d0c3533a19f48e5e7e73806a3e4b29cd4364130 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Mon, 9 Nov 2020 14:24:50 +0530 Subject: dmaengine: qcom: Add GPI dma driver This controller provides DMAengine capabilities for a variety of peripheral buses such as I2C, UART, and SPI. By using the GPI dmaengine driver, bus drivers can use a standardized interface that is protocol independent to transfer data between memory and the peripheral. Link: https://lore.kernel.org/r/20201109085450.24843-4-vkoul@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/qcom/Kconfig | 12 + drivers/dma/qcom/Makefile | 1 + drivers/dma/qcom/gpi.c | 2303 ++++++++++++++++++++++++++++++++++++++ include/linux/dma/qcom-gpi-dma.h | 83 ++ 4 files changed, 2399 insertions(+) create mode 100644 drivers/dma/qcom/gpi.c create mode 100644 include/linux/dma/qcom-gpi-dma.h (limited to 'include/linux') diff --git a/drivers/dma/qcom/Kconfig b/drivers/dma/qcom/Kconfig index 0389d60d2604..365f94eb3b08 100644 --- a/drivers/dma/qcom/Kconfig +++ b/drivers/dma/qcom/Kconfig @@ -19,6 +19,18 @@ config QCOM_BAM_DMA Enable support for the QCOM BAM DMA controller. This controller provides DMA capabilities for a variety of on-chip devices. +config QCOM_GPI_DMA + tristate "Qualcomm Technologies GPI DMA support" + depends on ARCH_QCOM + select DMA_ENGINE + select DMA_VIRTUAL_CHANNELS + help + Enable support for the QCOM GPI DMA controller. This controller + provides DMA capabilities for a variety of peripheral buses such + as I2C, UART, and SPI. By using the GPI dmaengine driver, bus drivers + can use a standardized interface that is protocol independent to + transfer data between DDR and the peripheral. + config QCOM_HIDMA_MGMT tristate "Qualcomm Technologies HIDMA Management support" select DMA_ENGINE diff --git a/drivers/dma/qcom/Makefile b/drivers/dma/qcom/Makefile index 346e643fbb6d..50f1e7014693 100644 --- a/drivers/dma/qcom/Makefile +++ b/drivers/dma/qcom/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_QCOM_ADM) += qcom_adm.o obj-$(CONFIG_QCOM_BAM_DMA) += bam_dma.o +obj-$(CONFIG_QCOM_GPI_DMA) += gpi.o obj-$(CONFIG_QCOM_HIDMA_MGMT) += hdma_mgmt.o hdma_mgmt-objs := hidma_mgmt.o hidma_mgmt_sys.o obj-$(CONFIG_QCOM_HIDMA) += hdma.o diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c new file mode 100644 index 000000000000..d2334f535de2 --- /dev/null +++ b/drivers/dma/qcom/gpi.c @@ -0,0 +1,2303 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2017-2020, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2020, Linaro Limited + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../dmaengine.h" +#include "../virt-dma.h" + +#define TRE_TYPE_DMA 0x10 +#define TRE_TYPE_GO 0x20 +#define TRE_TYPE_CONFIG0 0x22 + +/* TRE flags */ +#define TRE_FLAGS_CHAIN BIT(0) +#define TRE_FLAGS_IEOB BIT(8) +#define TRE_FLAGS_IEOT BIT(9) +#define TRE_FLAGS_BEI BIT(10) +#define TRE_FLAGS_LINK BIT(11) +#define TRE_FLAGS_TYPE GENMASK(23, 16) + +/* SPI CONFIG0 WD0 */ +#define TRE_SPI_C0_WORD_SZ GENMASK(4, 0) +#define TRE_SPI_C0_LOOPBACK BIT(8) +#define TRE_SPI_C0_CS BIT(11) +#define TRE_SPI_C0_CPHA BIT(12) +#define TRE_SPI_C0_CPOL BIT(13) +#define TRE_SPI_C0_TX_PACK BIT(24) +#define TRE_SPI_C0_RX_PACK BIT(25) + +/* CONFIG0 WD2 */ +#define TRE_C0_CLK_DIV GENMASK(11, 0) +#define TRE_C0_CLK_SRC GENMASK(19, 16) + +/* SPI GO WD0 */ +#define TRE_SPI_GO_CMD GENMASK(4, 0) +#define TRE_SPI_GO_CS GENMASK(10, 8) +#define TRE_SPI_GO_FRAG BIT(26) + +/* GO WD2 */ +#define TRE_RX_LEN GENMASK(23, 0) + +/* I2C Config0 WD0 */ +#define TRE_I2C_C0_TLOW GENMASK(7, 0) +#define TRE_I2C_C0_THIGH GENMASK(15, 8) +#define TRE_I2C_C0_TCYL GENMASK(23, 16) +#define TRE_I2C_C0_TX_PACK BIT(24) +#define TRE_I2C_C0_RX_PACK BIT(25) + +/* I2C GO WD0 */ +#define TRE_I2C_GO_CMD GENMASK(4, 0) +#define TRE_I2C_GO_ADDR GENMASK(14, 8) +#define TRE_I2C_GO_STRETCH BIT(26) + +/* DMA TRE */ +#define TRE_DMA_LEN GENMASK(23, 0) + +/* Register offsets from gpi-top */ +#define GPII_n_CH_k_CNTXT_0_OFFS(n, k) (0x20000 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_CH_k_CNTXT_0_EL_SIZE GENMASK(31, 24) +#define GPII_n_CH_k_CNTXT_0_CHSTATE GENMASK(23, 20) +#define GPII_n_CH_k_CNTXT_0_ERIDX GENMASK(18, 14) +#define GPII_n_CH_k_CNTXT_0_DIR BIT(3) +#define GPII_n_CH_k_CNTXT_0_PROTO GENMASK(2, 0) + +#define GPII_n_CH_k_CNTXT_0(el_size, erindex, dir, chtype_proto) \ + (FIELD_PREP(GPII_n_CH_k_CNTXT_0_EL_SIZE, el_size) | \ + FIELD_PREP(GPII_n_CH_k_CNTXT_0_ERIDX, erindex) | \ + FIELD_PREP(GPII_n_CH_k_CNTXT_0_DIR, dir) | \ + FIELD_PREP(GPII_n_CH_k_CNTXT_0_PROTO, chtype_proto)) + +#define GPI_CHTYPE_DIR_IN (0) +#define GPI_CHTYPE_DIR_OUT (1) + +#define GPI_CHTYPE_PROTO_GPI (0x2) + +#define GPII_n_CH_k_DOORBELL_0_OFFS(n, k) (0x22000 + (0x4000 * (n)) + (0x8 * (k))) +#define GPII_n_CH_CMD_OFFS(n) (0x23008 + (0x4000 * (n))) +#define GPII_n_CH_CMD_OPCODE GENMASK(31, 24) +#define GPII_n_CH_CMD_CHID GENMASK(7, 0) +#define GPII_n_CH_CMD(opcode, chid) \ + (FIELD_PREP(GPII_n_CH_CMD_OPCODE, opcode) | \ + FIELD_PREP(GPII_n_CH_CMD_CHID, chid)) + +#define GPII_n_CH_CMD_ALLOCATE (0) +#define GPII_n_CH_CMD_START (1) +#define GPII_n_CH_CMD_STOP (2) +#define GPII_n_CH_CMD_RESET (9) +#define GPII_n_CH_CMD_DE_ALLOC (10) +#define GPII_n_CH_CMD_UART_SW_STALE (32) +#define GPII_n_CH_CMD_UART_RFR_READY (33) +#define GPII_n_CH_CMD_UART_RFR_NOT_READY (34) + +/* EV Context Array */ +#define GPII_n_EV_CH_k_CNTXT_0_OFFS(n, k) (0x21000 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_EV_k_CNTXT_0_EL_SIZE GENMASK(31, 24) +#define GPII_n_EV_k_CNTXT_0_CHSTATE GENMASK(23, 20) +#define GPII_n_EV_k_CNTXT_0_INTYPE BIT(16) +#define GPII_n_EV_k_CNTXT_0_CHTYPE GENMASK(3, 0) + +#define GPII_n_EV_k_CNTXT_0(el_size, inttype, chtype) \ + (FIELD_PREP(GPII_n_EV_k_CNTXT_0_EL_SIZE, el_size) | \ + FIELD_PREP(GPII_n_EV_k_CNTXT_0_INTYPE, inttype) | \ + FIELD_PREP(GPII_n_EV_k_CNTXT_0_CHTYPE, chtype)) + +#define GPI_INTTYPE_IRQ (1) +#define GPI_CHTYPE_GPI_EV (0x2) + +enum CNTXT_OFFS { + CNTXT_0_CONFIG = 0x0, + CNTXT_1_R_LENGTH = 0x4, + 
CNTXT_2_RING_BASE_LSB = 0x8, + CNTXT_3_RING_BASE_MSB = 0xC, + CNTXT_4_RING_RP_LSB = 0x10, + CNTXT_5_RING_RP_MSB = 0x14, + CNTXT_6_RING_WP_LSB = 0x18, + CNTXT_7_RING_WP_MSB = 0x1C, + CNTXT_8_RING_INT_MOD = 0x20, + CNTXT_9_RING_INTVEC = 0x24, + CNTXT_10_RING_MSI_LSB = 0x28, + CNTXT_11_RING_MSI_MSB = 0x2C, + CNTXT_12_RING_RP_UPDATE_LSB = 0x30, + CNTXT_13_RING_RP_UPDATE_MSB = 0x34, +}; + +#define GPII_n_EV_CH_k_DOORBELL_0_OFFS(n, k) (0x22100 + (0x4000 * (n)) + (0x8 * (k))) +#define GPII_n_EV_CH_CMD_OFFS(n) (0x23010 + (0x4000 * (n))) +#define GPII_n_EV_CMD_OPCODE GENMASK(31, 24) +#define GPII_n_EV_CMD_CHID GENMASK(7, 0) +#define GPII_n_EV_CMD(opcode, chid) \ + (FIELD_PREP(GPII_n_EV_CMD_OPCODE, opcode) | \ + FIELD_PREP(GPII_n_EV_CMD_CHID, chid)) + +#define GPII_n_EV_CH_CMD_ALLOCATE (0x00) +#define GPII_n_EV_CH_CMD_RESET (0x09) +#define GPII_n_EV_CH_CMD_DE_ALLOC (0x0A) + +#define GPII_n_CNTXT_TYPE_IRQ_OFFS(n) (0x23080 + (0x4000 * (n))) + +/* mask type register */ +#define GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(n) (0x23088 + (0x4000 * (n))) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK GENMASK(6, 0) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_GENERAL BIT(6) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB BIT(3) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB BIT(2) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL BIT(1) +#define GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL BIT(0) + +#define GPII_n_CNTXT_SRC_GPII_CH_IRQ_OFFS(n) (0x23090 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_OFFS(n) (0x23094 + (0x4000 * (n))) + +/* Mask channel control interrupt register */ +#define GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(n) (0x23098 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK GENMASK(1, 0) + +/* Mask event control interrupt register */ +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(n) (0x2309C + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK BIT(0) + +#define GPII_n_CNTXT_SRC_CH_IRQ_CLR_OFFS(n) (0x230A0 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_EV_CH_IRQ_CLR_OFFS(n) (0x230A4 + (0x4000 * (n))) + +/* Mask event interrupt register */ +#define GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(n) (0x230B8 + (0x4000 * (n))) +#define GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK BIT(0) + +#define GPII_n_CNTXT_SRC_IEOB_IRQ_CLR_OFFS(n) (0x230C0 + (0x4000 * (n))) +#define GPII_n_CNTXT_GLOB_IRQ_STTS_OFFS(n) (0x23100 + (0x4000 * (n))) +#define GPI_GLOB_IRQ_ERROR_INT_MSK BIT(0) + +/* GPII specific Global - Enable bit register */ +#define GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(n) (0x23108 + (0x4000 * (n))) +#define GPII_n_CNTXT_GLOB_IRQ_CLR_OFFS(n) (0x23110 + (0x4000 * (n))) +#define GPII_n_CNTXT_GPII_IRQ_STTS_OFFS(n) (0x23118 + (0x4000 * (n))) + +/* GPII general interrupt - Enable bit register */ +#define GPII_n_CNTXT_GPII_IRQ_EN_OFFS(n) (0x23120 + (0x4000 * (n))) +#define GPII_n_CNTXT_GPII_IRQ_EN_BMSK GENMASK(3, 0) + +#define GPII_n_CNTXT_GPII_IRQ_CLR_OFFS(n) (0x23128 + (0x4000 * (n))) + +/* GPII Interrupt Type register */ +#define GPII_n_CNTXT_INTSET_OFFS(n) (0x23180 + (0x4000 * (n))) +#define GPII_n_CNTXT_INTSET_BMSK BIT(0) + +#define GPII_n_CNTXT_MSI_BASE_LSB_OFFS(n) (0x23188 + (0x4000 * (n))) +#define GPII_n_CNTXT_MSI_BASE_MSB_OFFS(n) (0x2318C + (0x4000 * (n))) +#define GPII_n_CNTXT_SCRATCH_0_OFFS(n) (0x23400 + (0x4000 * (n))) +#define GPII_n_CNTXT_SCRATCH_1_OFFS(n) (0x23404 + (0x4000 * (n))) + +#define GPII_n_ERROR_LOG_OFFS(n) (0x23200 + (0x4000 * (n))) + +/* QOS Registers */ +#define GPII_n_CH_k_QOS_OFFS(n, k) (0x2005C + (0x4000 * (n)) + (0x80 * (k))) + +/* Scratch registers */ +#define GPII_n_CH_k_SCRATCH_0_OFFS(n, k) (0x20060 + (0x4000 * (n)) + (0x80 * (k))) +#define 
GPII_n_CH_k_SCRATCH_0_SEID GENMASK(2, 0) +#define GPII_n_CH_k_SCRATCH_0_PROTO GENMASK(7, 4) +#define GPII_n_CH_k_SCRATCH_0_PAIR GENMASK(20, 16) +#define GPII_n_CH_k_SCRATCH_0(pair, proto, seid) \ + (FIELD_PREP(GPII_n_CH_k_SCRATCH_0_PAIR, pair) | \ + FIELD_PREP(GPII_n_CH_k_SCRATCH_0_PROTO, proto) | \ + FIELD_PREP(GPII_n_CH_k_SCRATCH_0_SEID, seid)) +#define GPII_n_CH_k_SCRATCH_1_OFFS(n, k) (0x20064 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_CH_k_SCRATCH_2_OFFS(n, k) (0x20068 + (0x4000 * (n)) + (0x80 * (k))) +#define GPII_n_CH_k_SCRATCH_3_OFFS(n, k) (0x2006C + (0x4000 * (n)) + (0x80 * (k))) + +struct __packed gpi_tre { + u32 dword[4]; +}; + +enum msm_gpi_tce_code { + MSM_GPI_TCE_SUCCESS = 1, + MSM_GPI_TCE_EOT = 2, + MSM_GPI_TCE_EOB = 4, + MSM_GPI_TCE_UNEXP_ERR = 16, +}; + +#define CMD_TIMEOUT_MS (250) + +#define MAX_CHANNELS_PER_GPII (2) +#define GPI_TX_CHAN (0) +#define GPI_RX_CHAN (1) +#define STATE_IGNORE (U32_MAX) +#define EV_FACTOR (2) +#define REQ_OF_DMA_ARGS (5) /* # of arguments required from client */ +#define CHAN_TRES 64 + +struct __packed xfer_compl_event { + u64 ptr; + u32 length:24; + u8 code; + u16 status; + u8 type; + u8 chid; +}; + +struct __packed immediate_data_event { + u8 data_bytes[8]; + u8 length:4; + u8 resvd:4; + u16 tre_index; + u8 code; + u16 status; + u8 type; + u8 chid; +}; + +struct __packed qup_notif_event { + u32 status; + u32 time; + u32 count:24; + u8 resvd; + u16 resvd1; + u8 type; + u8 chid; +}; + +struct __packed gpi_ere { + u32 dword[4]; +}; + +enum GPI_EV_TYPE { + XFER_COMPLETE_EV_TYPE = 0x22, + IMMEDIATE_DATA_EV_TYPE = 0x30, + QUP_NOTIF_EV_TYPE = 0x31, + STALE_EV_TYPE = 0xFF, +}; + +union __packed gpi_event { + struct __packed xfer_compl_event xfer_compl_event; + struct __packed immediate_data_event immediate_data_event; + struct __packed qup_notif_event qup_notif_event; + struct __packed gpi_ere gpi_ere; +}; + +enum gpii_irq_settings { + DEFAULT_IRQ_SETTINGS, + MASK_IEOB_SETTINGS, +}; + +enum gpi_ev_state { + DEFAULT_EV_CH_STATE = 0, + EV_STATE_NOT_ALLOCATED = DEFAULT_EV_CH_STATE, + EV_STATE_ALLOCATED, + MAX_EV_STATES +}; + +static const char *const gpi_ev_state_str[MAX_EV_STATES] = { + [EV_STATE_NOT_ALLOCATED] = "NOT ALLOCATED", + [EV_STATE_ALLOCATED] = "ALLOCATED", +}; + +#define TO_GPI_EV_STATE_STR(_state) (((_state) >= MAX_EV_STATES) ? 
\ + "INVALID" : gpi_ev_state_str[(_state)]) + +enum gpi_ch_state { + DEFAULT_CH_STATE = 0x0, + CH_STATE_NOT_ALLOCATED = DEFAULT_CH_STATE, + CH_STATE_ALLOCATED = 0x1, + CH_STATE_STARTED = 0x2, + CH_STATE_STOPPED = 0x3, + CH_STATE_STOP_IN_PROC = 0x4, + CH_STATE_ERROR = 0xf, + MAX_CH_STATES +}; + +enum gpi_cmd { + GPI_CH_CMD_BEGIN, + GPI_CH_CMD_ALLOCATE = GPI_CH_CMD_BEGIN, + GPI_CH_CMD_START, + GPI_CH_CMD_STOP, + GPI_CH_CMD_RESET, + GPI_CH_CMD_DE_ALLOC, + GPI_CH_CMD_UART_SW_STALE, + GPI_CH_CMD_UART_RFR_READY, + GPI_CH_CMD_UART_RFR_NOT_READY, + GPI_CH_CMD_END = GPI_CH_CMD_UART_RFR_NOT_READY, + GPI_EV_CMD_BEGIN, + GPI_EV_CMD_ALLOCATE = GPI_EV_CMD_BEGIN, + GPI_EV_CMD_RESET, + GPI_EV_CMD_DEALLOC, + GPI_EV_CMD_END = GPI_EV_CMD_DEALLOC, + GPI_MAX_CMD, +}; + +#define IS_CHAN_CMD(_cmd) ((_cmd) <= GPI_CH_CMD_END) + +static const char *const gpi_cmd_str[GPI_MAX_CMD] = { + [GPI_CH_CMD_ALLOCATE] = "CH ALLOCATE", + [GPI_CH_CMD_START] = "CH START", + [GPI_CH_CMD_STOP] = "CH STOP", + [GPI_CH_CMD_RESET] = "CH_RESET", + [GPI_CH_CMD_DE_ALLOC] = "DE ALLOC", + [GPI_CH_CMD_UART_SW_STALE] = "UART SW STALE", + [GPI_CH_CMD_UART_RFR_READY] = "UART RFR READY", + [GPI_CH_CMD_UART_RFR_NOT_READY] = "UART RFR NOT READY", + [GPI_EV_CMD_ALLOCATE] = "EV ALLOCATE", + [GPI_EV_CMD_RESET] = "EV RESET", + [GPI_EV_CMD_DEALLOC] = "EV DEALLOC", +}; + +#define TO_GPI_CMD_STR(_cmd) (((_cmd) >= GPI_MAX_CMD) ? "INVALID" : \ + gpi_cmd_str[(_cmd)]) + +/* + * @DISABLE_STATE: no register access allowed + * @CONFIG_STATE: client has configured the channel + * @PREP_HARDWARE: register access is allowed + * however, no processing EVENTS + * @ACTIVE_STATE: channels are fully operational + * @PREPARE_TERMINATE: graceful termination of channels + * register access is allowed + * @PAUSE_STATE: channels are active, but not processing any events + */ +enum gpi_pm_state { + DISABLE_STATE, + CONFIG_STATE, + PREPARE_HARDWARE, + ACTIVE_STATE, + PREPARE_TERMINATE, + PAUSE_STATE, + MAX_PM_STATE +}; + +#define REG_ACCESS_VALID(_pm_state) ((_pm_state) >= PREPARE_HARDWARE) + +static const char *const gpi_pm_state_str[MAX_PM_STATE] = { + [DISABLE_STATE] = "DISABLE", + [CONFIG_STATE] = "CONFIG", + [PREPARE_HARDWARE] = "PREPARE HARDWARE", + [ACTIVE_STATE] = "ACTIVE", + [PREPARE_TERMINATE] = "PREPARE TERMINATE", + [PAUSE_STATE] = "PAUSE", +}; + +#define TO_GPI_PM_STR(_state) (((_state) >= MAX_PM_STATE) ? 
\ + "INVALID" : gpi_pm_state_str[(_state)]) + +static const struct { + enum gpi_cmd gpi_cmd; + u32 opcode; + u32 state; +} gpi_cmd_info[GPI_MAX_CMD] = { + { + GPI_CH_CMD_ALLOCATE, + GPII_n_CH_CMD_ALLOCATE, + CH_STATE_ALLOCATED, + }, + { + GPI_CH_CMD_START, + GPII_n_CH_CMD_START, + CH_STATE_STARTED, + }, + { + GPI_CH_CMD_STOP, + GPII_n_CH_CMD_STOP, + CH_STATE_STOPPED, + }, + { + GPI_CH_CMD_RESET, + GPII_n_CH_CMD_RESET, + CH_STATE_ALLOCATED, + }, + { + GPI_CH_CMD_DE_ALLOC, + GPII_n_CH_CMD_DE_ALLOC, + CH_STATE_NOT_ALLOCATED, + }, + { + GPI_CH_CMD_UART_SW_STALE, + GPII_n_CH_CMD_UART_SW_STALE, + STATE_IGNORE, + }, + { + GPI_CH_CMD_UART_RFR_READY, + GPII_n_CH_CMD_UART_RFR_READY, + STATE_IGNORE, + }, + { + GPI_CH_CMD_UART_RFR_NOT_READY, + GPII_n_CH_CMD_UART_RFR_NOT_READY, + STATE_IGNORE, + }, + { + GPI_EV_CMD_ALLOCATE, + GPII_n_EV_CH_CMD_ALLOCATE, + EV_STATE_ALLOCATED, + }, + { + GPI_EV_CMD_RESET, + GPII_n_EV_CH_CMD_RESET, + EV_STATE_ALLOCATED, + }, + { + GPI_EV_CMD_DEALLOC, + GPII_n_EV_CH_CMD_DE_ALLOC, + EV_STATE_NOT_ALLOCATED, + }, +}; + +struct gpi_ring { + void *pre_aligned; + size_t alloc_size; + phys_addr_t phys_addr; + dma_addr_t dma_handle; + void *base; + void *wp; + void *rp; + u32 len; + u32 el_size; + u32 elements; + bool configured; +}; + +struct gpi_dev { + struct dma_device dma_device; + struct device *dev; + struct resource *res; + void __iomem *regs; + void __iomem *ee_base; /*ee register base address*/ + u32 max_gpii; /* maximum # of gpii instances available per gpi block */ + u32 gpii_mask; /* gpii instances available for apps */ + u32 ev_factor; /* ev ring length factor */ + struct gpii *gpiis; +}; + +struct reg_info { + char *name; + u32 offset; + u32 val; +}; + +struct gchan { + struct virt_dma_chan vc; + u32 chid; + u32 seid; + u32 protocol; + struct gpii *gpii; + enum gpi_ch_state ch_state; + enum gpi_pm_state pm_state; + void __iomem *ch_cntxt_base_reg; + void __iomem *ch_cntxt_db_reg; + void __iomem *ch_cmd_reg; + u32 dir; + struct gpi_ring ch_ring; + void *config; +}; + +struct gpii { + u32 gpii_id; + struct gchan gchan[MAX_CHANNELS_PER_GPII]; + struct gpi_dev *gpi_dev; + int irq; + void __iomem *regs; /* points to gpi top */ + void __iomem *ev_cntxt_base_reg; + void __iomem *ev_cntxt_db_reg; + void __iomem *ev_ring_rp_lsb_reg; + void __iomem *ev_cmd_reg; + void __iomem *ieob_clr_reg; + struct mutex ctrl_lock; + enum gpi_ev_state ev_state; + bool configured_irq; + enum gpi_pm_state pm_state; + rwlock_t pm_lock; + struct gpi_ring ev_ring; + struct tasklet_struct ev_task; /* event processing tasklet */ + struct completion cmd_completion; + enum gpi_cmd gpi_cmd; + u32 cntxt_type_irq_msk; + bool ieob_set; +}; + +#define MAX_TRE 3 + +struct gpi_desc { + struct virt_dma_desc vd; + size_t len; + void *db; /* DB register to program */ + struct gchan *gchan; + struct gpi_tre tre[MAX_TRE]; + u32 num_tre; +}; + +static const u32 GPII_CHAN_DIR[MAX_CHANNELS_PER_GPII] = { + GPI_CHTYPE_DIR_OUT, GPI_CHTYPE_DIR_IN +}; + +static irqreturn_t gpi_handle_irq(int irq, void *data); +static void gpi_ring_recycle_ev_element(struct gpi_ring *ring); +static int gpi_ring_add_element(struct gpi_ring *ring, void **wp); +static void gpi_process_events(struct gpii *gpii); + +static inline struct gchan *to_gchan(struct dma_chan *dma_chan) +{ + return container_of(dma_chan, struct gchan, vc.chan); +} + +static inline struct gpi_desc *to_gpi_desc(struct virt_dma_desc *vd) +{ + return container_of(vd, struct gpi_desc, vd); +} + +static inline phys_addr_t to_physical(const struct gpi_ring *const ring, + 
void *addr) +{ + return ring->phys_addr + (addr - ring->base); +} + +static inline void *to_virtual(const struct gpi_ring *const ring, phys_addr_t addr) +{ + return ring->base + (addr - ring->phys_addr); +} + +static inline u32 gpi_read_reg(struct gpii *gpii, void __iomem *addr) +{ + return readl_relaxed(addr); +} + +static inline void gpi_write_reg(struct gpii *gpii, void __iomem *addr, u32 val) +{ + writel_relaxed(val, addr); +} + +/* gpi_write_reg_field - write to specific bit field */ +static inline void gpi_write_reg_field(struct gpii *gpii, void __iomem *addr, + u32 mask, u32 shift, u32 val) +{ + u32 tmp = gpi_read_reg(gpii, addr); + + tmp &= ~mask; + val = tmp | ((val << shift) & mask); + gpi_write_reg(gpii, addr, val); +} + +static inline void +gpi_update_reg(struct gpii *gpii, u32 offset, u32 mask, u32 val) +{ + void __iomem *addr = gpii->regs + offset; + u32 tmp = gpi_read_reg(gpii, addr); + + tmp &= ~mask; + tmp |= u32_encode_bits(val, mask); + + gpi_write_reg(gpii, addr, tmp); +} + +static void gpi_disable_interrupts(struct gpii *gpii) +{ + gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_GPII_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_INTSET_OFFS(gpii->gpii_id), + GPII_n_CNTXT_INTSET_BMSK, 0); + + gpii->cntxt_type_irq_msk = 0; + devm_free_irq(gpii->gpi_dev->dev, gpii->irq, gpii); + gpii->configured_irq = false; +} + +/* configure and enable interrupts */ +static int gpi_config_interrupts(struct gpii *gpii, enum gpii_irq_settings settings, bool mask) +{ + const u32 enable = (GPII_n_CNTXT_TYPE_IRQ_MSK_GENERAL | + GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB | + GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB | + GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL | + GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL); + int ret; + + if (!gpii->configured_irq) { + ret = devm_request_irq(gpii->gpi_dev->dev, gpii->irq, + gpi_handle_irq, IRQF_TRIGGER_HIGH, + "gpi-dma", gpii); + if (ret < 0) { + dev_err(gpii->gpi_dev->dev, "error request irq:%d ret:%d\n", + gpii->irq, ret); + return ret; + } + } + + if (settings == MASK_IEOB_SETTINGS) { + /* + * GPII only uses one EV ring per gpii so we can globally + * enable/disable IEOB interrupt + */ + if (mask) + gpii->cntxt_type_irq_msk |= GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB; + else + gpii->cntxt_type_irq_msk &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB); + gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, gpii->cntxt_type_irq_msk); + } else { + gpi_update_reg(gpii, GPII_n_CNTXT_TYPE_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_TYPE_IRQ_MSK_BMSK, enable); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK, + GPII_n_CNTXT_SRC_IEOB_IRQ_MSK_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK, + GPII_n_CNTXT_SRC_CH_IRQ_MSK_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_OFFS(gpii->gpii_id), + GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK, + 
GPII_n_CNTXT_SRC_EV_CH_IRQ_MSK_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_GLOB_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, + GPII_n_CNTXT_GPII_IRQ_EN_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_GPII_IRQ_EN_OFFS(gpii->gpii_id), + GPII_n_CNTXT_GPII_IRQ_EN_BMSK, GPII_n_CNTXT_GPII_IRQ_EN_BMSK); + gpi_update_reg(gpii, GPII_n_CNTXT_MSI_BASE_LSB_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_MSI_BASE_MSB_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SCRATCH_0_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_SCRATCH_1_OFFS(gpii->gpii_id), U32_MAX, 0); + gpi_update_reg(gpii, GPII_n_CNTXT_INTSET_OFFS(gpii->gpii_id), + GPII_n_CNTXT_INTSET_BMSK, 1); + gpi_update_reg(gpii, GPII_n_ERROR_LOG_OFFS(gpii->gpii_id), U32_MAX, 0); + + gpii->cntxt_type_irq_msk = enable; + } + + gpii->configured_irq = true; + return 0; +} + +/* Sends gpii event or channel command */ +static int gpi_send_cmd(struct gpii *gpii, struct gchan *gchan, + enum gpi_cmd gpi_cmd) +{ + u32 chid = MAX_CHANNELS_PER_GPII; + unsigned long timeout; + void __iomem *cmd_reg; + u32 cmd; + + if (gpi_cmd >= GPI_MAX_CMD) + return -EINVAL; + if (IS_CHAN_CMD(gpi_cmd)) + chid = gchan->chid; + + dev_dbg(gpii->gpi_dev->dev, + "sending cmd: %s:%u\n", TO_GPI_CMD_STR(gpi_cmd), chid); + + /* send opcode and wait for completion */ + reinit_completion(&gpii->cmd_completion); + gpii->gpi_cmd = gpi_cmd; + + cmd_reg = IS_CHAN_CMD(gpi_cmd) ? gchan->ch_cmd_reg : gpii->ev_cmd_reg; + cmd = IS_CHAN_CMD(gpi_cmd) ? GPII_n_CH_CMD(gpi_cmd_info[gpi_cmd].opcode, chid) : + GPII_n_EV_CMD(gpi_cmd_info[gpi_cmd].opcode, 0); + gpi_write_reg(gpii, cmd_reg, cmd); + timeout = wait_for_completion_timeout(&gpii->cmd_completion, + msecs_to_jiffies(CMD_TIMEOUT_MS)); + if (!timeout) { + dev_err(gpii->gpi_dev->dev, "cmd: %s completion timeout:%u\n", + TO_GPI_CMD_STR(gpi_cmd), chid); + return -EIO; + } + + /* confirm new ch state is correct , if the cmd is a state change cmd */ + if (gpi_cmd_info[gpi_cmd].state == STATE_IGNORE) + return 0; + + if (IS_CHAN_CMD(gpi_cmd) && gchan->ch_state == gpi_cmd_info[gpi_cmd].state) + return 0; + + if (!IS_CHAN_CMD(gpi_cmd) && gpii->ev_state == gpi_cmd_info[gpi_cmd].state) + return 0; + + return -EIO; +} + +/* program transfer ring DB register */ +static inline void gpi_write_ch_db(struct gchan *gchan, + struct gpi_ring *ring, void *wp) +{ + struct gpii *gpii = gchan->gpii; + phys_addr_t p_wp; + + p_wp = to_physical(ring, wp); + gpi_write_reg(gpii, gchan->ch_cntxt_db_reg, p_wp); +} + +/* program event ring DB register */ +static inline void gpi_write_ev_db(struct gpii *gpii, + struct gpi_ring *ring, void *wp) +{ + phys_addr_t p_wp; + + p_wp = ring->phys_addr + (wp - ring->base); + gpi_write_reg(gpii, gpii->ev_cntxt_db_reg, p_wp); +} + +/* process transfer completion interrupt */ +static void gpi_process_ieob(struct gpii *gpii) +{ + gpi_write_reg(gpii, gpii->ieob_clr_reg, BIT(0)); + + gpi_config_interrupts(gpii, MASK_IEOB_SETTINGS, 0); + tasklet_hi_schedule(&gpii->ev_task); +} + +/* process channel control interrupt */ +static void gpi_process_ch_ctrl_irq(struct gpii *gpii) +{ + u32 gpii_id = gpii->gpii_id; + u32 offset = GPII_n_CNTXT_SRC_GPII_CH_IRQ_OFFS(gpii_id); + u32 ch_irq = gpi_read_reg(gpii, gpii->regs + offset); + struct gchan *gchan; + u32 chid, state; + + /* clear the status */ + offset = GPII_n_CNTXT_SRC_CH_IRQ_CLR_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, (u32)ch_irq); + + for (chid = 0; chid < MAX_CHANNELS_PER_GPII; chid++) { + if (!(BIT(chid) & 
ch_irq)) + continue; + + gchan = &gpii->gchan[chid]; + state = gpi_read_reg(gpii, gchan->ch_cntxt_base_reg + + CNTXT_0_CONFIG); + state = FIELD_GET(GPII_n_CH_k_CNTXT_0_CHSTATE, state); + + /* + * CH_CMD_DEALLOC cmd always successful. However cmd does + * not change hardware status. So overwriting software state + * to default state. + */ + if (gpii->gpi_cmd == GPI_CH_CMD_DE_ALLOC) + state = DEFAULT_CH_STATE; + gchan->ch_state = state; + + /* + * Triggering complete all if ch_state is not a stop in process. + * Stop in process is a transition state and we will wait for + * stop interrupt before notifying. + */ + if (gchan->ch_state != CH_STATE_STOP_IN_PROC) + complete_all(&gpii->cmd_completion); + } +} + +/* processing gpi general error interrupts */ +static void gpi_process_gen_err_irq(struct gpii *gpii) +{ + u32 gpii_id = gpii->gpii_id; + u32 offset = GPII_n_CNTXT_GPII_IRQ_STTS_OFFS(gpii_id); + u32 irq_stts = gpi_read_reg(gpii, gpii->regs + offset); + + /* clear the status */ + dev_dbg(gpii->gpi_dev->dev, "irq_stts:0x%x\n", irq_stts); + + /* Clear the register */ + offset = GPII_n_CNTXT_GPII_IRQ_CLR_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, irq_stts); +} + +/* processing gpi level error interrupts */ +static void gpi_process_glob_err_irq(struct gpii *gpii) +{ + u32 gpii_id = gpii->gpii_id; + u32 offset = GPII_n_CNTXT_GLOB_IRQ_STTS_OFFS(gpii_id); + u32 irq_stts = gpi_read_reg(gpii, gpii->regs + offset); + + offset = GPII_n_CNTXT_GLOB_IRQ_CLR_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, irq_stts); + + /* only error interrupt should be set */ + if (irq_stts & ~GPI_GLOB_IRQ_ERROR_INT_MSK) { + dev_err(gpii->gpi_dev->dev, "invalid error status:0x%x\n", irq_stts); + return; + } + + offset = GPII_n_ERROR_LOG_OFFS(gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, 0); +} + +/* gpii interrupt handler */ +static irqreturn_t gpi_handle_irq(int irq, void *data) +{ + struct gpii *gpii = data; + u32 gpii_id = gpii->gpii_id; + u32 type, offset; + unsigned long flags; + + read_lock_irqsave(&gpii->pm_lock, flags); + + /* + * States are out of sync to receive interrupt + * while software state is in DISABLE state, bailing out. + */ + if (!REG_ACCESS_VALID(gpii->pm_state)) { + dev_err(gpii->gpi_dev->dev, "receive interrupt while in %s state\n", + TO_GPI_PM_STR(gpii->pm_state)); + goto exit_irq; + } + + offset = GPII_n_CNTXT_TYPE_IRQ_OFFS(gpii->gpii_id); + type = gpi_read_reg(gpii, gpii->regs + offset); + + do { + /* global gpii error */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB) { + gpi_process_glob_err_irq(gpii); + type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_GLOB); + } + + /* transfer complete interrupt */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB) { + gpi_process_ieob(gpii); + type &= ~GPII_n_CNTXT_TYPE_IRQ_MSK_IEOB; + } + + /* event control irq */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL) { + u32 ev_state; + u32 ev_ch_irq; + + dev_dbg(gpii->gpi_dev->dev, + "processing EV CTRL interrupt\n"); + offset = GPII_n_CNTXT_SRC_EV_CH_IRQ_OFFS(gpii_id); + ev_ch_irq = gpi_read_reg(gpii, gpii->regs + offset); + + offset = GPII_n_CNTXT_SRC_EV_CH_IRQ_CLR_OFFS + (gpii_id); + gpi_write_reg(gpii, gpii->regs + offset, ev_ch_irq); + ev_state = gpi_read_reg(gpii, gpii->ev_cntxt_base_reg + + CNTXT_0_CONFIG); + ev_state = FIELD_GET(GPII_n_EV_k_CNTXT_0_CHSTATE, ev_state); + + /* + * CMD EV_CMD_DEALLOC is always successful. However + * cmd does not change hardware status. So overwriting + * software state to default state. 
+ */ + if (gpii->gpi_cmd == GPI_EV_CMD_DEALLOC) + ev_state = DEFAULT_EV_CH_STATE; + + gpii->ev_state = ev_state; + dev_dbg(gpii->gpi_dev->dev, "setting EV state to %s\n", + TO_GPI_EV_STATE_STR(gpii->ev_state)); + complete_all(&gpii->cmd_completion); + type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_EV_CTRL); + } + + /* channel control irq */ + if (type & GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL) { + dev_dbg(gpii->gpi_dev->dev, "process CH CTRL interrupts\n"); + gpi_process_ch_ctrl_irq(gpii); + type &= ~(GPII_n_CNTXT_TYPE_IRQ_MSK_CH_CTRL); + } + + if (type) { + dev_err(gpii->gpi_dev->dev, "Unhandled interrupt status:0x%x\n", type); + gpi_process_gen_err_irq(gpii); + goto exit_irq; + } + + offset = GPII_n_CNTXT_TYPE_IRQ_OFFS(gpii->gpii_id); + type = gpi_read_reg(gpii, gpii->regs + offset); + } while (type); + +exit_irq: + read_unlock_irqrestore(&gpii->pm_lock, flags); + + return IRQ_HANDLED; +} + +/* process DMA Immediate completion data events */ +static void gpi_process_imed_data_event(struct gchan *gchan, + struct immediate_data_event *imed_event) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ch_ring = &gchan->ch_ring; + void *tre = ch_ring->base + (ch_ring->el_size * imed_event->tre_index); + struct dmaengine_result result; + struct gpi_desc *gpi_desc; + struct virt_dma_desc *vd; + unsigned long flags; + u32 chid; + + /* + * If channel not active don't process event + */ + if (gchan->pm_state != ACTIVE_STATE) { + dev_err(gpii->gpi_dev->dev, "skipping processing event because ch @ %s state\n", + TO_GPI_PM_STR(gchan->pm_state)); + return; + } + + spin_lock_irqsave(&gchan->vc.lock, flags); + vd = vchan_next_desc(&gchan->vc); + if (!vd) { + struct gpi_ere *gpi_ere; + struct gpi_tre *gpi_tre; + + spin_unlock_irqrestore(&gchan->vc.lock, flags); + dev_dbg(gpii->gpi_dev->dev, "event without a pending descriptor!\n"); + gpi_ere = (struct gpi_ere *)imed_event; + dev_dbg(gpii->gpi_dev->dev, + "Event: %08x %08x %08x %08x\n", + gpi_ere->dword[0], gpi_ere->dword[1], + gpi_ere->dword[2], gpi_ere->dword[3]); + gpi_tre = tre; + dev_dbg(gpii->gpi_dev->dev, + "Pending TRE: %08x %08x %08x %08x\n", + gpi_tre->dword[0], gpi_tre->dword[1], + gpi_tre->dword[2], gpi_tre->dword[3]); + return; + } + gpi_desc = to_gpi_desc(vd); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + + /* + * RP pointed by Event is to last TRE processed, + * we need to update ring rp to tre + 1 + */ + tre += ch_ring->el_size; + if (tre >= (ch_ring->base + ch_ring->len)) + tre = ch_ring->base; + ch_ring->rp = tre; + + /* make sure rp updates are immediately visible to all cores */ + smp_wmb(); + + chid = imed_event->chid; + if (imed_event->code == MSM_GPI_TCE_EOT && gpii->ieob_set) { + if (chid == GPI_RX_CHAN) + goto gpi_free_desc; + else + return; + } + + if (imed_event->code == MSM_GPI_TCE_UNEXP_ERR) + result.result = DMA_TRANS_ABORTED; + else + result.result = DMA_TRANS_NOERROR; + result.residue = gpi_desc->len - imed_event->length; + + dma_cookie_complete(&vd->tx); + dmaengine_desc_get_callback_invoke(&vd->tx, &result); + +gpi_free_desc: + spin_lock_irqsave(&gchan->vc.lock, flags); + list_del(&vd->node); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + kfree(gpi_desc); + gpi_desc = NULL; +} + +/* processing transfer completion events */ +static void gpi_process_xfer_compl_event(struct gchan *gchan, + struct xfer_compl_event *compl_event) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ch_ring = &gchan->ch_ring; + void *ev_rp = to_virtual(ch_ring, compl_event->ptr); + struct virt_dma_desc *vd; + struct gpi_desc *gpi_desc; + struct 
dmaengine_result result; + unsigned long flags; + u32 chid; + + /* only process events on active channel */ + if (unlikely(gchan->pm_state != ACTIVE_STATE)) { + dev_err(gpii->gpi_dev->dev, "skipping processing event because ch @ %s state\n", + TO_GPI_PM_STR(gchan->pm_state)); + return; + } + + spin_lock_irqsave(&gchan->vc.lock, flags); + vd = vchan_next_desc(&gchan->vc); + if (!vd) { + struct gpi_ere *gpi_ere; + + spin_unlock_irqrestore(&gchan->vc.lock, flags); + dev_err(gpii->gpi_dev->dev, "Event without a pending descriptor!\n"); + gpi_ere = (struct gpi_ere *)compl_event; + dev_err(gpii->gpi_dev->dev, + "Event: %08x %08x %08x %08x\n", + gpi_ere->dword[0], gpi_ere->dword[1], + gpi_ere->dword[2], gpi_ere->dword[3]); + return; + } + + gpi_desc = to_gpi_desc(vd); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + + /* + * RP pointed by Event is to last TRE processed, + * we need to update ring rp to ev_rp + 1 + */ + ev_rp += ch_ring->el_size; + if (ev_rp >= (ch_ring->base + ch_ring->len)) + ev_rp = ch_ring->base; + ch_ring->rp = ev_rp; + + /* update must be visible to other cores */ + smp_wmb(); + + chid = compl_event->chid; + if (compl_event->code == MSM_GPI_TCE_EOT && gpii->ieob_set) { + if (chid == GPI_RX_CHAN) + goto gpi_free_desc; + else + return; + } + + if (compl_event->code == MSM_GPI_TCE_UNEXP_ERR) { + dev_err(gpii->gpi_dev->dev, "Error in Transaction\n"); + result.result = DMA_TRANS_ABORTED; + } else { + dev_dbg(gpii->gpi_dev->dev, "Transaction Success\n"); + result.result = DMA_TRANS_NOERROR; + } + result.residue = gpi_desc->len - compl_event->length; + dev_dbg(gpii->gpi_dev->dev, "Residue %d\n", result.residue); + + dma_cookie_complete(&vd->tx); + dmaengine_desc_get_callback_invoke(&vd->tx, &result); + +gpi_free_desc: + spin_lock_irqsave(&gchan->vc.lock, flags); + list_del(&vd->node); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + kfree(gpi_desc); + gpi_desc = NULL; +} + +/* process all events */ +static void gpi_process_events(struct gpii *gpii) +{ + struct gpi_ring *ev_ring = &gpii->ev_ring; + phys_addr_t cntxt_rp; + void *rp; + union gpi_event *gpi_event; + struct gchan *gchan; + u32 chid, type; + + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + rp = to_virtual(ev_ring, cntxt_rp); + + do { + while (rp != ev_ring->rp) { + gpi_event = ev_ring->rp; + chid = gpi_event->xfer_compl_event.chid; + type = gpi_event->xfer_compl_event.type; + + dev_dbg(gpii->gpi_dev->dev, + "Event: CHID:%u, type:%x %08x %08x %08x %08x\n", + chid, type, gpi_event->gpi_ere.dword[0], + gpi_event->gpi_ere.dword[1], gpi_event->gpi_ere.dword[2], + gpi_event->gpi_ere.dword[3]); + + switch (type) { + case XFER_COMPLETE_EV_TYPE: + gchan = &gpii->gchan[chid]; + gpi_process_xfer_compl_event(gchan, + &gpi_event->xfer_compl_event); + break; + case STALE_EV_TYPE: + dev_dbg(gpii->gpi_dev->dev, "stale event, not processing\n"); + break; + case IMMEDIATE_DATA_EV_TYPE: + gchan = &gpii->gchan[chid]; + gpi_process_imed_data_event(gchan, + &gpi_event->immediate_data_event); + break; + case QUP_NOTIF_EV_TYPE: + dev_dbg(gpii->gpi_dev->dev, "QUP_NOTIF_EV_TYPE\n"); + break; + default: + dev_dbg(gpii->gpi_dev->dev, + "not supported event type:0x%x\n", type); + } + gpi_ring_recycle_ev_element(ev_ring); + } + gpi_write_ev_db(gpii, ev_ring, ev_ring->wp); + + /* clear pending IEOB events */ + gpi_write_reg(gpii, gpii->ieob_clr_reg, BIT(0)); + + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + rp = to_virtual(ev_ring, cntxt_rp); + + } while (rp != ev_ring->rp); +} + +/* processing events using tasklet */ 
+static void gpi_ev_tasklet(unsigned long data) +{ + struct gpii *gpii = (struct gpii *)data; + + read_lock_bh(&gpii->pm_lock); + if (!REG_ACCESS_VALID(gpii->pm_state)) { + read_unlock_bh(&gpii->pm_lock); + dev_err(gpii->gpi_dev->dev, "not processing any events, pm_state:%s\n", + TO_GPI_PM_STR(gpii->pm_state)); + return; + } + + /* process the events */ + gpi_process_events(gpii); + + /* enable IEOB, switching back to interrupts */ + gpi_config_interrupts(gpii, MASK_IEOB_SETTINGS, 1); + read_unlock_bh(&gpii->pm_lock); +} + +/* marks all pending events for the channel as stale */ +static void gpi_mark_stale_events(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ev_ring = &gpii->ev_ring; + u32 cntxt_rp, local_rp; + void *ev_rp; + + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + + ev_rp = ev_ring->rp; + local_rp = (u32)to_physical(ev_ring, ev_rp); + while (local_rp != cntxt_rp) { + union gpi_event *gpi_event = ev_rp; + u32 chid = gpi_event->xfer_compl_event.chid; + + if (chid == gchan->chid) + gpi_event->xfer_compl_event.type = STALE_EV_TYPE; + ev_rp += ev_ring->el_size; + if (ev_rp >= (ev_ring->base + ev_ring->len)) + ev_rp = ev_ring->base; + cntxt_rp = gpi_read_reg(gpii, gpii->ev_ring_rp_lsb_reg); + local_rp = (u32)to_physical(ev_ring, ev_rp); + } +} + +/* reset sw state and issue channel reset or de-alloc */ +static int gpi_reset_chan(struct gchan *gchan, enum gpi_cmd gpi_cmd) +{ + struct gpii *gpii = gchan->gpii; + struct gpi_ring *ch_ring = &gchan->ch_ring; + unsigned long flags; + LIST_HEAD(list); + int ret; + + ret = gpi_send_cmd(gpii, gchan, gpi_cmd); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(gpi_cmd), ret); + return ret; + } + + /* initialize the local ring ptrs */ + ch_ring->rp = ch_ring->base; + ch_ring->wp = ch_ring->base; + + /* visible to other cores */ + smp_wmb(); + + /* check event ring for any stale events */ + write_lock_irq(&gpii->pm_lock); + gpi_mark_stale_events(gchan); + + /* remove all async descriptors */ + spin_lock_irqsave(&gchan->vc.lock, flags); + vchan_get_all_descriptors(&gchan->vc, &list); + spin_unlock_irqrestore(&gchan->vc.lock, flags); + write_unlock_irq(&gpii->pm_lock); + vchan_dma_desc_free_list(&gchan->vc, &list); + + return 0; +} + +static int gpi_start_chan(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + int ret; + + ret = gpi_send_cmd(gpii, gchan, GPI_CH_CMD_START); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_CH_CMD_START), ret); + return ret; + } + + /* gpii CH is active now */ + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = ACTIVE_STATE; + write_unlock_irq(&gpii->pm_lock); + + return 0; +} + +static int gpi_stop_chan(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + int ret; + + ret = gpi_send_cmd(gpii, gchan, GPI_CH_CMD_STOP); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_CH_CMD_STOP), ret); + return ret; + } + + return 0; +} + +/* allocate and configure the transfer channel */ +static int gpi_alloc_chan(struct gchan *chan, bool send_alloc_cmd) +{ + struct gpii *gpii = chan->gpii; + struct gpi_ring *ring = &chan->ch_ring; + int ret; + u32 id = gpii->gpii_id; + u32 chid = chan->chid; + u32 pair_chid = !chid; + + if (send_alloc_cmd) { + ret = gpi_send_cmd(gpii, chan, GPI_CH_CMD_ALLOCATE); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_CH_CMD_ALLOCATE), ret); + return ret; + } + } + + 
gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_0_CONFIG, + GPII_n_CH_k_CNTXT_0(ring->el_size, 0, chan->dir, GPI_CHTYPE_PROTO_GPI)); + gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_1_R_LENGTH, ring->len); + gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_2_RING_BASE_LSB, ring->phys_addr); + gpi_write_reg(gpii, chan->ch_cntxt_base_reg + CNTXT_3_RING_BASE_MSB, + upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, chan->ch_cntxt_db_reg + CNTXT_5_RING_RP_MSB - CNTXT_4_RING_RP_LSB, + upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_0_OFFS(id, chid), + GPII_n_CH_k_SCRATCH_0(pair_chid, chan->protocol, chan->seid)); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_1_OFFS(id, chid), 0); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_2_OFFS(id, chid), 0); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_SCRATCH_3_OFFS(id, chid), 0); + gpi_write_reg(gpii, gpii->regs + GPII_n_CH_k_QOS_OFFS(id, chid), 1); + + /* flush all the writes */ + wmb(); + return 0; +} + +/* allocate and configure event ring */ +static int gpi_alloc_ev_chan(struct gpii *gpii) +{ + struct gpi_ring *ring = &gpii->ev_ring; + void __iomem *base = gpii->ev_cntxt_base_reg; + int ret; + + ret = gpi_send_cmd(gpii, NULL, GPI_EV_CMD_ALLOCATE); + if (ret) { + dev_err(gpii->gpi_dev->dev, "error with cmd:%s ret:%d\n", + TO_GPI_CMD_STR(GPI_EV_CMD_ALLOCATE), ret); + return ret; + } + + /* program event context */ + gpi_write_reg(gpii, base + CNTXT_0_CONFIG, + GPII_n_EV_k_CNTXT_0(ring->el_size, GPI_INTTYPE_IRQ, GPI_CHTYPE_GPI_EV)); + gpi_write_reg(gpii, base + CNTXT_1_R_LENGTH, ring->len); + gpi_write_reg(gpii, base + CNTXT_2_RING_BASE_LSB, lower_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, base + CNTXT_3_RING_BASE_MSB, upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, gpii->ev_cntxt_db_reg + CNTXT_5_RING_RP_MSB - CNTXT_4_RING_RP_LSB, + upper_32_bits(ring->phys_addr)); + gpi_write_reg(gpii, base + CNTXT_8_RING_INT_MOD, 0); + gpi_write_reg(gpii, base + CNTXT_10_RING_MSI_LSB, 0); + gpi_write_reg(gpii, base + CNTXT_11_RING_MSI_MSB, 0); + gpi_write_reg(gpii, base + CNTXT_8_RING_INT_MOD, 0); + gpi_write_reg(gpii, base + CNTXT_12_RING_RP_UPDATE_LSB, 0); + gpi_write_reg(gpii, base + CNTXT_13_RING_RP_UPDATE_MSB, 0); + + /* add events to ring */ + ring->wp = (ring->base + ring->len - ring->el_size); + + /* flush all the writes */ + wmb(); + + /* gpii is active now */ + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = ACTIVE_STATE; + write_unlock_irq(&gpii->pm_lock); + gpi_write_ev_db(gpii, ring, ring->wp); + + return 0; +} + +/* calculate # of ERE/TRE available to queue */ +static int gpi_ring_num_elements_avail(const struct gpi_ring * const ring) +{ + int elements = 0; + + if (ring->wp < ring->rp) { + elements = ((ring->rp - ring->wp) / ring->el_size) - 1; + } else { + elements = (ring->rp - ring->base) / ring->el_size; + elements += ((ring->base + ring->len - ring->wp) / ring->el_size) - 1; + } + + return elements; +} + +static int gpi_ring_add_element(struct gpi_ring *ring, void **wp) +{ + if (gpi_ring_num_elements_avail(ring) <= 0) + return -ENOMEM; + + *wp = ring->wp; + ring->wp += ring->el_size; + if (ring->wp >= (ring->base + ring->len)) + ring->wp = ring->base; + + /* visible to other cores */ + smp_wmb(); + + return 0; +} + +static void gpi_ring_recycle_ev_element(struct gpi_ring *ring) +{ + /* Update the WP */ + ring->wp += ring->el_size; + if (ring->wp >= (ring->base + ring->len)) + ring->wp = ring->base; + + /* Update the RP */ + ring->rp += ring->el_size; + if 
(ring->rp >= (ring->base + ring->len)) + ring->rp = ring->base; + + /* visible to other cores */ + smp_wmb(); +} + +static void gpi_free_ring(struct gpi_ring *ring, + struct gpii *gpii) +{ + dma_free_coherent(gpii->gpi_dev->dev, ring->alloc_size, + ring->pre_aligned, ring->dma_handle); + memset(ring, 0, sizeof(*ring)); +} + +/* allocate memory for transfer and event rings */ +static int gpi_alloc_ring(struct gpi_ring *ring, u32 elements, + u32 el_size, struct gpii *gpii) +{ + u64 len = elements * el_size; + int bit; + + /* ring len must be power of 2 */ + bit = find_last_bit((unsigned long *)&len, 32); + if (((1 << bit) - 1) & len) + bit++; + len = 1 << bit; + ring->alloc_size = (len + (len - 1)); + dev_dbg(gpii->gpi_dev->dev, + "#el:%u el_size:%u len:%u actual_len:%llu alloc_size:%lu\n", + elements, el_size, (elements * el_size), len, + ring->alloc_size); + + ring->pre_aligned = dma_alloc_coherent(gpii->gpi_dev->dev, + ring->alloc_size, + &ring->dma_handle, GFP_KERNEL); + if (!ring->pre_aligned) { + dev_err(gpii->gpi_dev->dev, "could not alloc size:%lu mem for ring\n", + ring->alloc_size); + return -ENOMEM; + } + + /* align the physical mem */ + ring->phys_addr = (ring->dma_handle + (len - 1)) & ~(len - 1); + ring->base = ring->pre_aligned + (ring->phys_addr - ring->dma_handle); + ring->rp = ring->base; + ring->wp = ring->base; + ring->len = len; + ring->el_size = el_size; + ring->elements = ring->len / ring->el_size; + memset(ring->base, 0, ring->len); + ring->configured = true; + + /* update to other cores */ + smp_wmb(); + + dev_dbg(gpii->gpi_dev->dev, + "phy_pre:0x%0llx phy_alig:0x%0llx len:%u el_size:%u elements:%u\n", + ring->dma_handle, ring->phys_addr, ring->len, + ring->el_size, ring->elements); + + return 0; +} + +/* copy tre into transfer ring */ +static void gpi_queue_xfer(struct gpii *gpii, struct gchan *gchan, + struct gpi_tre *gpi_tre, void **wp) +{ + struct gpi_tre *ch_tre; + int ret; + + /* get next tre location we can copy */ + ret = gpi_ring_add_element(&gchan->ch_ring, (void **)&ch_tre); + if (unlikely(ret)) { + dev_err(gpii->gpi_dev->dev, "Error adding ring element to xfer ring\n"); + return; + } + + /* copy the tre info */ + memcpy(ch_tre, gpi_tre, sizeof(*ch_tre)); + *wp = ch_tre; +} + +/* reset and restart transfer channel */ +static int gpi_terminate_all(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int schid, echid, i; + int ret = 0; + + mutex_lock(&gpii->ctrl_lock); + + /* + * treat both channels as a group if its protocol is not UART + * STOP, RESET, or START needs to be in lockstep + */ + schid = (gchan->protocol == QCOM_GPI_UART) ? gchan->chid : 0; + echid = (gchan->protocol == QCOM_GPI_UART) ? 
schid + 1 : MAX_CHANNELS_PER_GPII; + + /* stop the channel */ + for (i = schid; i < echid; i++) { + gchan = &gpii->gchan[i]; + + /* disable ch state so no more TRE processing */ + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = PREPARE_TERMINATE; + write_unlock_irq(&gpii->pm_lock); + + /* send command to Stop the channel */ + ret = gpi_stop_chan(gchan); + } + + /* reset the channels (clears any pending tre) */ + for (i = schid; i < echid; i++) { + gchan = &gpii->gchan[i]; + + ret = gpi_reset_chan(gchan, GPI_CH_CMD_RESET); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error resetting channel ret:%d\n", ret); + goto terminate_exit; + } + + /* reprogram channel CNTXT */ + ret = gpi_alloc_chan(gchan, false); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error alloc_channel ret:%d\n", ret); + goto terminate_exit; + } + } + + /* restart the channels */ + for (i = schid; i < echid; i++) { + gchan = &gpii->gchan[i]; + + ret = gpi_start_chan(gchan); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error Starting Channel ret:%d\n", ret); + goto terminate_exit; + } + } + +terminate_exit: + mutex_unlock(&gpii->ctrl_lock); + return ret; +} + +/* pause dma transfer for all channels */ +static int gpi_pause(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int i, ret; + + mutex_lock(&gpii->ctrl_lock); + + /* + * pause/resume are per gpii not per channel, so + * client needs to call pause only once + */ + if (gpii->pm_state == PAUSE_STATE) { + dev_dbg(gpii->gpi_dev->dev, "channel is already paused\n"); + mutex_unlock(&gpii->ctrl_lock); + return 0; + } + + /* send stop command to stop the channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_stop_chan(&gpii->gchan[i]); + if (ret) { + mutex_unlock(&gpii->ctrl_lock); + return ret; + } + } + + disable_irq(gpii->irq); + + /* Wait for threads to complete out */ + tasklet_kill(&gpii->ev_task); + + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = PAUSE_STATE; + write_unlock_irq(&gpii->pm_lock); + mutex_unlock(&gpii->ctrl_lock); + + return 0; +} + +/* resume dma transfer */ +static int gpi_resume(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int i, ret; + + mutex_lock(&gpii->ctrl_lock); + if (gpii->pm_state == ACTIVE_STATE) { + dev_dbg(gpii->gpi_dev->dev, "channel is already active\n"); + mutex_unlock(&gpii->ctrl_lock); + return 0; + } + + enable_irq(gpii->irq); + + /* send start command to start the channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_send_cmd(gpii, &gpii->gchan[i], GPI_CH_CMD_START); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error starting chan, ret:%d\n", ret); + mutex_unlock(&gpii->ctrl_lock); + return ret; + } + } + + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = ACTIVE_STATE; + write_unlock_irq(&gpii->pm_lock); + mutex_unlock(&gpii->ctrl_lock); + + return 0; +} + +static void gpi_desc_free(struct virt_dma_desc *vd) +{ + struct gpi_desc *gpi_desc = to_gpi_desc(vd); + + kfree(gpi_desc); + gpi_desc = NULL; +} + +static int +gpi_peripheral_config(struct dma_chan *chan, struct dma_slave_config *config) +{ + struct gchan *gchan = to_gchan(chan); + + if (!config->peripheral_config) + return -EINVAL; + + gchan->config = krealloc(gchan->config, config->peripheral_size, GFP_NOWAIT); + if (!gchan->config) + return -ENOMEM; + + memcpy(gchan->config, config->peripheral_config, config->peripheral_size); + + return 0; +} + +static int gpi_create_i2c_tre(struct gchan *chan, struct gpi_desc *desc, + struct 
scatterlist *sgl, enum dma_transfer_direction direction) +{ + struct gpi_i2c_config *i2c = chan->config; + struct device *dev = chan->gpii->gpi_dev->dev; + unsigned int tre_idx = 0; + dma_addr_t address; + struct gpi_tre *tre; + unsigned int i; + + /* first create config tre if applicable */ + if (i2c->set_config) { + tre = &desc->tre[tre_idx]; + tre_idx++; + + tre->dword[0] = u32_encode_bits(i2c->low_count, TRE_I2C_C0_TLOW); + tre->dword[0] |= u32_encode_bits(i2c->high_count, TRE_I2C_C0_THIGH); + tre->dword[0] |= u32_encode_bits(i2c->cycle_count, TRE_I2C_C0_TCYL); + tre->dword[0] |= u32_encode_bits(i2c->pack_enable, TRE_I2C_C0_TX_PACK); + tre->dword[0] |= u32_encode_bits(i2c->pack_enable, TRE_I2C_C0_RX_PACK); + + tre->dword[1] = 0; + + tre->dword[2] = u32_encode_bits(i2c->clk_div, TRE_C0_CLK_DIV); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_CONFIG0, TRE_FLAGS_TYPE); + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + /* create the GO tre for Tx */ + if (i2c->op == I2C_WRITE) { + tre = &desc->tre[tre_idx]; + tre_idx++; + + if (i2c->multi_msg) + tre->dword[0] = u32_encode_bits(I2C_READ, TRE_I2C_GO_CMD); + else + tre->dword[0] = u32_encode_bits(i2c->op, TRE_I2C_GO_CMD); + + tre->dword[0] |= u32_encode_bits(i2c->addr, TRE_I2C_GO_ADDR); + tre->dword[0] |= u32_encode_bits(i2c->stretch, TRE_I2C_GO_STRETCH); + + tre->dword[1] = 0; + tre->dword[2] = u32_encode_bits(i2c->rx_len, TRE_RX_LEN); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_GO, TRE_FLAGS_TYPE); + + if (i2c->multi_msg) + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_LINK); + else + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + if (i2c->op == I2C_READ || i2c->multi_msg == false) { + /* create the DMA TRE */ + tre = &desc->tre[tre_idx]; + tre_idx++; + + address = sg_dma_address(sgl); + tre->dword[0] = lower_32_bits(address); + tre->dword[1] = upper_32_bits(address); + + tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE); + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT); + } + + for (i = 0; i < tre_idx; i++) + dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0], + desc->tre[i].dword[1], desc->tre[i].dword[2], desc->tre[i].dword[3]); + + return tre_idx; +} + +static int gpi_create_spi_tre(struct gchan *chan, struct gpi_desc *desc, + struct scatterlist *sgl, enum dma_transfer_direction direction) +{ + struct gpi_spi_config *spi = chan->config; + struct device *dev = chan->gpii->gpi_dev->dev; + unsigned int tre_idx = 0; + dma_addr_t address; + struct gpi_tre *tre; + unsigned int i; + + /* first create config tre if applicable */ + if (direction == DMA_MEM_TO_DEV && spi->set_config) { + tre = &desc->tre[tre_idx]; + tre_idx++; + + tre->dword[0] = u32_encode_bits(spi->word_len, TRE_SPI_C0_WORD_SZ); + tre->dword[0] |= u32_encode_bits(spi->loopback_en, TRE_SPI_C0_LOOPBACK); + tre->dword[0] |= u32_encode_bits(spi->clock_pol_high, TRE_SPI_C0_CPOL); + tre->dword[0] |= u32_encode_bits(spi->data_pol_high, TRE_SPI_C0_CPHA); + tre->dword[0] |= u32_encode_bits(spi->pack_en, TRE_SPI_C0_TX_PACK); + tre->dword[0] |= u32_encode_bits(spi->pack_en, TRE_SPI_C0_RX_PACK); + + tre->dword[1] = 0; + + tre->dword[2] = u32_encode_bits(spi->clk_div, TRE_C0_CLK_DIV); + tre->dword[2] |= u32_encode_bits(spi->clk_src, TRE_C0_CLK_SRC); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_CONFIG0, TRE_FLAGS_TYPE); + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + /* create the GO tre for Tx */ + if (direction == DMA_MEM_TO_DEV) { + tre = 
&desc->tre[tre_idx]; + tre_idx++; + + tre->dword[0] = u32_encode_bits(spi->fragmentation, TRE_SPI_GO_FRAG); + tre->dword[0] |= u32_encode_bits(spi->cs, TRE_SPI_GO_CS); + tre->dword[0] |= u32_encode_bits(spi->cmd, TRE_SPI_GO_CMD); + + tre->dword[1] = 0; + + tre->dword[2] = u32_encode_bits(spi->rx_len, TRE_RX_LEN); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_GO, TRE_FLAGS_TYPE); + if (spi->cmd == SPI_RX) + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOB); + else + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_CHAIN); + } + + /* create the dma tre */ + tre = &desc->tre[tre_idx]; + tre_idx++; + + address = sg_dma_address(sgl); + tre->dword[0] = lower_32_bits(address); + tre->dword[1] = upper_32_bits(address); + + tre->dword[2] = u32_encode_bits(sg_dma_len(sgl), TRE_DMA_LEN); + + tre->dword[3] = u32_encode_bits(TRE_TYPE_DMA, TRE_FLAGS_TYPE); + if (direction == DMA_MEM_TO_DEV) + tre->dword[3] |= u32_encode_bits(1, TRE_FLAGS_IEOT); + + for (i = 0; i < tre_idx; i++) + dev_dbg(dev, "TRE:%d %x:%x:%x:%x\n", i, desc->tre[i].dword[0], + desc->tre[i].dword[1], desc->tre[i].dword[2], desc->tre[i].dword[3]); + + return tre_idx; +} + +/* prepare a dma descriptor carrying the TREs for the transfer */ +static struct dma_async_tx_descriptor * +gpi_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, + unsigned int sg_len, enum dma_transfer_direction direction, + unsigned long flags, void *context) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + struct device *dev = gpii->gpi_dev->dev; + struct gpi_ring *ch_ring = &gchan->ch_ring; + struct gpi_desc *gpi_desc; + u32 nr, nr_tre = 0; + u8 set_config; + int i; + + gpii->ieob_set = false; + if (!is_slave_direction(direction)) { + dev_err(gpii->gpi_dev->dev, "invalid dma direction: %d\n", direction); + return NULL; + } + + if (sg_len > 1) { + dev_err(dev, "Multi sg sent, we support only one atm: %d\n", sg_len); + return NULL; + } + + nr_tre = 3; + set_config = *(u32 *)gchan->config; + if (!set_config) + nr_tre = 2; + if (direction == DMA_DEV_TO_MEM) /* rx */ + nr_tre = 1; + + /* calculate # of elements required & available */ + nr = gpi_ring_num_elements_avail(ch_ring); + if (nr < nr_tre) { + dev_err(dev, "not enough space in ring, avail:%u required:%u\n", nr, nr_tre); + return NULL; + } + + gpi_desc = kzalloc(sizeof(*gpi_desc), GFP_NOWAIT); + if (!gpi_desc) + return NULL; + + /* create TREs for xfer */ + if (gchan->protocol == QCOM_GPI_SPI) { + i = gpi_create_spi_tre(gchan, gpi_desc, sgl, direction); + } else if (gchan->protocol == QCOM_GPI_I2C) { + i = gpi_create_i2c_tre(gchan, gpi_desc, sgl, direction); + } else { + dev_err(dev, "invalid peripheral: %d\n", gchan->protocol); + kfree(gpi_desc); + return NULL; + } + + /* set up the descriptor */ + gpi_desc->gchan = gchan; + gpi_desc->len = sg_dma_len(sgl); + gpi_desc->num_tre = i; + + return vchan_tx_prep(&gchan->vc, &gpi_desc->vd, flags); +} + +/* ring the transfer ring doorbell to begin the transfer */ +static void gpi_issue_pending(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + unsigned long flags, pm_lock_flags; + struct virt_dma_desc *vd = NULL; + struct gpi_desc *gpi_desc; + struct gpi_ring *ch_ring = &gchan->ch_ring; + void *tre, *wp = NULL; + int i; + + read_lock_irqsave(&gpii->pm_lock, pm_lock_flags); + + /* move all submitted descriptors to the issued list */ + spin_lock_irqsave(&gchan->vc.lock, flags); + if (vchan_issue_pending(&gchan->vc)) + vd = list_last_entry(&gchan->vc.desc_issued, + struct virt_dma_desc, node); + 
spin_unlock_irqrestore(&gchan->vc.lock, flags); + + /* nothing to do, the list is empty */ + if (!vd) { + read_unlock_irqrestore(&gpii->pm_lock, pm_lock_flags); + return; + } + + gpi_desc = to_gpi_desc(vd); + for (i = 0; i < gpi_desc->num_tre; i++) { + tre = &gpi_desc->tre[i]; + gpi_queue_xfer(gpii, gchan, tre, &wp); + } + + gpi_desc->db = ch_ring->wp; + gpi_write_ch_db(gchan, &gchan->ch_ring, gpi_desc->db); + read_unlock_irqrestore(&gpii->pm_lock, pm_lock_flags); +} + +static int gpi_ch_init(struct gchan *gchan) +{ + struct gpii *gpii = gchan->gpii; + const int ev_factor = gpii->gpi_dev->ev_factor; + u32 elements; + int i = 0, ret = 0; + + gchan->pm_state = CONFIG_STATE; + + /* check if both channels are configured before continuing */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) + if (gpii->gchan[i].pm_state != CONFIG_STATE) + goto exit_gpi_init; + + /* protocol must be same for both channels */ + if (gpii->gchan[0].protocol != gpii->gchan[1].protocol) { + dev_err(gpii->gpi_dev->dev, "protocols did not match: %u != %u\n", + gpii->gchan[0].protocol, gpii->gchan[1].protocol); + ret = -EINVAL; + goto exit_gpi_init; + } + + /* allocate memory for event ring */ + elements = CHAN_TRES << ev_factor; + ret = gpi_alloc_ring(&gpii->ev_ring, elements, + sizeof(union gpi_event), gpii); + if (ret) + goto exit_gpi_init; + + /* configure interrupts */ + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = PREPARE_HARDWARE; + write_unlock_irq(&gpii->pm_lock); + ret = gpi_config_interrupts(gpii, DEFAULT_IRQ_SETTINGS, 0); + if (ret) { + dev_err(gpii->gpi_dev->dev, "error configuring interrupts, ret:%d\n", ret); + goto error_config_int; + } + + /* allocate the event ring */ + ret = gpi_alloc_ev_chan(gpii); + if (ret) { + dev_err(gpii->gpi_dev->dev, "error alloc_ev_chan:%d\n", ret); + goto error_alloc_ev_ring; + } + + /* Allocate all channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_alloc_chan(&gpii->gchan[i], true); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error allocating chan:%d\n", ret); + goto error_alloc_chan; + } + } + + /* start channels */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) { + ret = gpi_start_chan(&gpii->gchan[i]); + if (ret) { + dev_err(gpii->gpi_dev->dev, "Error start chan:%d\n", ret); + goto error_start_chan; + } + } + return ret; + +error_start_chan: + for (i = i - 1; i >= 0; i--) { + gpi_stop_chan(&gpii->gchan[i]); + gpi_send_cmd(gpii, &gpii->gchan[i], GPI_CH_CMD_RESET); + } + i = 2; +error_alloc_chan: + for (i = i - 1; i >= 0; i--) + gpi_reset_chan(&gpii->gchan[i], GPI_CH_CMD_DE_ALLOC); +error_alloc_ev_ring: + gpi_disable_interrupts(gpii); +error_config_int: + gpi_free_ring(&gpii->ev_ring, gpii); +exit_gpi_init: + return ret; +} + +/* release all channel resources */ +static void gpi_free_chan_resources(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + enum gpi_pm_state cur_state; + int ret, i; + + mutex_lock(&gpii->ctrl_lock); + + cur_state = gchan->pm_state; + + /* disable ch state so no more TRE processing for this channel */ + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = PREPARE_TERMINATE; + write_unlock_irq(&gpii->pm_lock); + + /* attempt to do graceful hardware shutdown */ + if (cur_state == ACTIVE_STATE) { + gpi_stop_chan(gchan); + + ret = gpi_send_cmd(gpii, gchan, GPI_CH_CMD_RESET); + if (ret) + dev_err(gpii->gpi_dev->dev, "error resetting channel:%d\n", ret); + + gpi_reset_chan(gchan, GPI_CH_CMD_DE_ALLOC); + } + + /* free all allocated memory */ + gpi_free_ring(&gchan->ch_ring, 
gpii); + vchan_free_chan_resources(&gchan->vc); + kfree(gchan->config); + + write_lock_irq(&gpii->pm_lock); + gchan->pm_state = DISABLE_STATE; + write_unlock_irq(&gpii->pm_lock); + + /* if other rings are still active exit */ + for (i = 0; i < MAX_CHANNELS_PER_GPII; i++) + if (gpii->gchan[i].ch_ring.configured) + goto exit_free; + + /* deallocate EV Ring */ + cur_state = gpii->pm_state; + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = PREPARE_TERMINATE; + write_unlock_irq(&gpii->pm_lock); + + /* wait for the event tasklet to finish */ + tasklet_kill(&gpii->ev_task); + + /* send command to de-allocate the event ring */ + if (cur_state == ACTIVE_STATE) + gpi_send_cmd(gpii, NULL, GPI_EV_CMD_DEALLOC); + + gpi_free_ring(&gpii->ev_ring, gpii); + + /* disable interrupts */ + if (cur_state == ACTIVE_STATE) + gpi_disable_interrupts(gpii); + + /* set final state to disable */ + write_lock_irq(&gpii->pm_lock); + gpii->pm_state = DISABLE_STATE; + write_unlock_irq(&gpii->pm_lock); + +exit_free: + mutex_unlock(&gpii->ctrl_lock); +} + +/* allocate channel resources */ +static int gpi_alloc_chan_resources(struct dma_chan *chan) +{ + struct gchan *gchan = to_gchan(chan); + struct gpii *gpii = gchan->gpii; + int ret; + + mutex_lock(&gpii->ctrl_lock); + + /* allocate memory for transfer ring */ + ret = gpi_alloc_ring(&gchan->ch_ring, CHAN_TRES, + sizeof(struct gpi_tre), gpii); + if (ret) + goto xfer_alloc_err; + + ret = gpi_ch_init(gchan); + + mutex_unlock(&gpii->ctrl_lock); + + return ret; +xfer_alloc_err: + mutex_unlock(&gpii->ctrl_lock); + + return ret; +} + +static int gpi_find_avail_gpii(struct gpi_dev *gpi_dev, u32 seid) +{ + struct gchan *tx_chan, *rx_chan; + unsigned int gpii; + + /* check if same seid is already configured for another chid */ + for (gpii = 0; gpii < gpi_dev->max_gpii; gpii++) { + if (!((1 << gpii) & gpi_dev->gpii_mask)) + continue; + + tx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_TX_CHAN]; + rx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_RX_CHAN]; + + if (rx_chan->vc.chan.client_count && rx_chan->seid == seid) + return gpii; + if (tx_chan->vc.chan.client_count && tx_chan->seid == seid) + return gpii; + } + + /* no channels configured with same seid, return next avail gpii */ + for (gpii = 0; gpii < gpi_dev->max_gpii; gpii++) { + if (!((1 << gpii) & gpi_dev->gpii_mask)) + continue; + + tx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_TX_CHAN]; + rx_chan = &gpi_dev->gpiis[gpii].gchan[GPI_RX_CHAN]; + + /* check if gpii is configured */ + if (tx_chan->vc.chan.client_count || + rx_chan->vc.chan.client_count) + continue; + + /* found a free gpii */ + return gpii; + } + + /* no gpii instance available to use */ + return -EIO; +} + +/* gpi_of_dma_xlate: open client requested channel */ +static struct dma_chan *gpi_of_dma_xlate(struct of_phandle_args *args, + struct of_dma *of_dma) +{ + struct gpi_dev *gpi_dev = (struct gpi_dev *)of_dma->of_dma_data; + u32 seid, chid; + int gpii; + struct gchan *gchan; + + if (args->args_count < 3) { + dev_err(gpi_dev->dev, "gpii requires minimum 3 args, client passed:%d args\n", + args->args_count); + return NULL; + } + + chid = args->args[0]; + if (chid >= MAX_CHANNELS_PER_GPII) { + dev_err(gpi_dev->dev, "gpii channel:%d not valid\n", chid); + return NULL; + } + + seid = args->args[1]; + + /* find next available gpii to use */ + gpii = gpi_find_avail_gpii(gpi_dev, seid); + if (gpii < 0) { + dev_err(gpi_dev->dev, "no available gpii instances\n"); + return NULL; + } + + gchan = &gpi_dev->gpiis[gpii].gchan[chid]; + if (gchan->vc.chan.client_count) { + dev_err(gpi_dev->dev, 
"gpii:%d chid:%d seid:%d already configured\n", + gpii, chid, gchan->seid); + return NULL; + } + + gchan->seid = seid; + gchan->protocol = args->args[2]; + + return dma_get_slave_channel(&gchan->vc.chan); +} + +static int gpi_probe(struct platform_device *pdev) +{ + struct gpi_dev *gpi_dev; + unsigned int i; + int ret; + + gpi_dev = devm_kzalloc(&pdev->dev, sizeof(*gpi_dev), GFP_KERNEL); + if (!gpi_dev) + return -ENOMEM; + + gpi_dev->dev = &pdev->dev; + gpi_dev->res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + gpi_dev->regs = devm_ioremap_resource(gpi_dev->dev, gpi_dev->res); + if (IS_ERR(gpi_dev->regs)) + return PTR_ERR(gpi_dev->regs); + gpi_dev->ee_base = gpi_dev->regs; + + ret = of_property_read_u32(gpi_dev->dev->of_node, "dma-channels", + &gpi_dev->max_gpii); + if (ret) { + dev_err(gpi_dev->dev, "missing 'max-no-gpii' DT node\n"); + return ret; + } + + ret = of_property_read_u32(gpi_dev->dev->of_node, "dma-channel-mask", + &gpi_dev->gpii_mask); + if (ret) { + dev_err(gpi_dev->dev, "missing 'gpii-mask' DT node\n"); + return ret; + } + + gpi_dev->ev_factor = EV_FACTOR; + + ret = dma_set_mask(gpi_dev->dev, DMA_BIT_MASK(64)); + if (ret) { + dev_err(gpi_dev->dev, "Error setting dma_mask to 64, ret:%d\n", ret); + return ret; + } + + gpi_dev->gpiis = devm_kzalloc(gpi_dev->dev, sizeof(*gpi_dev->gpiis) * + gpi_dev->max_gpii, GFP_KERNEL); + if (!gpi_dev->gpiis) + return -ENOMEM; + + /* setup all the supported gpii */ + INIT_LIST_HEAD(&gpi_dev->dma_device.channels); + for (i = 0; i < gpi_dev->max_gpii; i++) { + struct gpii *gpii = &gpi_dev->gpiis[i]; + int chan; + + if (!((1 << i) & gpi_dev->gpii_mask)) + continue; + + /* set up ev cntxt register map */ + gpii->ev_cntxt_base_reg = gpi_dev->ee_base + GPII_n_EV_CH_k_CNTXT_0_OFFS(i, 0); + gpii->ev_cntxt_db_reg = gpi_dev->ee_base + GPII_n_EV_CH_k_DOORBELL_0_OFFS(i, 0); + gpii->ev_ring_rp_lsb_reg = gpii->ev_cntxt_base_reg + CNTXT_4_RING_RP_LSB; + gpii->ev_cmd_reg = gpi_dev->ee_base + GPII_n_EV_CH_CMD_OFFS(i); + gpii->ieob_clr_reg = gpi_dev->ee_base + GPII_n_CNTXT_SRC_IEOB_IRQ_CLR_OFFS(i); + + /* set up irq */ + ret = platform_get_irq(pdev, i); + if (ret < 0) { + dev_err(gpi_dev->dev, "platform_get_irq failed for %d:%d\n", i, ret); + return ret; + } + gpii->irq = ret; + + /* set up channel specific register info */ + for (chan = 0; chan < MAX_CHANNELS_PER_GPII; chan++) { + struct gchan *gchan = &gpii->gchan[chan]; + + /* set up ch cntxt register map */ + gchan->ch_cntxt_base_reg = gpi_dev->ee_base + + GPII_n_CH_k_CNTXT_0_OFFS(i, chan); + gchan->ch_cntxt_db_reg = gpi_dev->ee_base + + GPII_n_CH_k_DOORBELL_0_OFFS(i, chan); + gchan->ch_cmd_reg = gpi_dev->ee_base + GPII_n_CH_CMD_OFFS(i); + + /* vchan setup */ + vchan_init(&gchan->vc, &gpi_dev->dma_device); + gchan->vc.desc_free = gpi_desc_free; + gchan->chid = chan; + gchan->gpii = gpii; + gchan->dir = GPII_CHAN_DIR[chan]; + } + mutex_init(&gpii->ctrl_lock); + rwlock_init(&gpii->pm_lock); + tasklet_init(&gpii->ev_task, gpi_ev_tasklet, + (unsigned long)gpii); + init_completion(&gpii->cmd_completion); + gpii->gpii_id = i; + gpii->regs = gpi_dev->ee_base; + gpii->gpi_dev = gpi_dev; + } + + platform_set_drvdata(pdev, gpi_dev); + + /* clear and Set capabilities */ + dma_cap_zero(gpi_dev->dma_device.cap_mask); + dma_cap_set(DMA_SLAVE, gpi_dev->dma_device.cap_mask); + + /* configure dmaengine apis */ + gpi_dev->dma_device.directions = BIT(DMA_DEV_TO_MEM) | BIT(DMA_MEM_TO_DEV); + gpi_dev->dma_device.residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR; + gpi_dev->dma_device.src_addr_widths = 
DMA_SLAVE_BUSWIDTH_8_BYTES; + gpi_dev->dma_device.dst_addr_widths = DMA_SLAVE_BUSWIDTH_8_BYTES; + gpi_dev->dma_device.device_alloc_chan_resources = gpi_alloc_chan_resources; + gpi_dev->dma_device.device_free_chan_resources = gpi_free_chan_resources; + gpi_dev->dma_device.device_tx_status = dma_cookie_status; + gpi_dev->dma_device.device_issue_pending = gpi_issue_pending; + gpi_dev->dma_device.device_prep_slave_sg = gpi_prep_slave_sg; + gpi_dev->dma_device.device_config = gpi_peripheral_config; + gpi_dev->dma_device.device_terminate_all = gpi_terminate_all; + gpi_dev->dma_device.dev = gpi_dev->dev; + gpi_dev->dma_device.device_pause = gpi_pause; + gpi_dev->dma_device.device_resume = gpi_resume; + + /* register with dmaengine framework */ + ret = dma_async_device_register(&gpi_dev->dma_device); + if (ret) { + dev_err(gpi_dev->dev, "async_device_register failed ret:%d\n", ret); + return ret; + } + + ret = of_dma_controller_register(gpi_dev->dev->of_node, + gpi_of_dma_xlate, gpi_dev); + if (ret) { + dev_err(gpi_dev->dev, "of_dma_controller_reg failed ret:%d\n", ret); + return ret; + } + + return ret; +} + +static const struct of_device_id gpi_of_match[] = { + { .compatible = "qcom,sdm845-gpi-dma" }, + { }, +}; +MODULE_DEVICE_TABLE(of, gpi_of_match); + +static struct platform_driver gpi_driver = { + .probe = gpi_probe, + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = gpi_of_match, + }, +}; + +static int __init gpi_init(void) +{ + return platform_driver_register(&gpi_driver); +} +subsys_initcall(gpi_init); + +MODULE_DESCRIPTION("QCOM GPI DMA engine driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h new file mode 100644 index 000000000000..f46dc3372f11 --- /dev/null +++ b/include/linux/dma/qcom-gpi-dma.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020, Linaro Limited + */ + +#ifndef QCOM_GPI_DMA_H +#define QCOM_GPI_DMA_H + +/** + * enum spi_transfer_cmd - spi transfer commands + */ +enum spi_transfer_cmd { + SPI_TX = 1, + SPI_RX, + SPI_DUPLEX, +}; + +/** + * struct gpi_spi_config - spi config for peripheral + * + * @loopback_en: spi loopback enable when set + * @clock_pol_high: clock polarity + * @data_pol_high: data polarity + * @pack_en: process tx/rx buffers as packed + * @word_len: spi word length + * @clk_div: source clock divider + * @clk_src: serial clock + * @cmd: spi cmd + * @fragmentation: keep CS asserted at end of sequence + * @cs: chip select toggle + * @set_config: set peripheral config + * @rx_len: receive length for buffer + */ +struct gpi_spi_config { + u8 set_config; + u8 loopback_en; + u8 clock_pol_high; + u8 data_pol_high; + u8 pack_en; + u8 word_len; + u8 fragmentation; + u8 cs; + u32 clk_div; + u32 clk_src; + enum spi_transfer_cmd cmd; + u32 rx_len; +}; + +enum i2c_op { + I2C_WRITE = 1, + I2C_READ, +}; + +/** + * struct gpi_i2c_config - i2c config for peripheral + * + * @pack_enable: process tx/rx buffers as packed + * @cycle_count: clock cycles to be sent + * @high_count: high period of clock + * @low_count: low period of clock + * @clk_div: source clock divider + * @addr: i2c bus address + * @stretch: stretch the clock at eot + * @set_config: set peripheral config + * @rx_len: receive length for buffer + * @op: i2c cmd + * @multi_msg: is part of multi i2c r-w msgs + */ +struct gpi_i2c_config { + u8 set_config; + u8 pack_enable; + u8 cycle_count; + u8 high_count; + u8 low_count; + u8 addr; + u8 stretch; + u16 clk_div; + u32 rx_len; + enum i2c_op op; + bool multi_msg; 
+}; + +#endif /* QCOM_GPI_DMA_H */ -- cgit v1.2.3 From 0801a0073f86e020987acbbd96b50f9c85d79de8 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:14 +0100 Subject: module: drop version-attribute alignment Commit 98562ad8cb03 ("module: explicitly align module_version_attribute structure") added an alignment attribute to the struct module_version_attribute type in order to fix an alignment issue on m68k where the structure is 2-byte aligned while MODULE_VERSION() forced the __modver section entries to be 4-byte aligned (sizeof(void *)). This was essentially an alternative fix to the problem addressed by b4bc842802db ("module: deal with alignment issues in built-in module versions") which used the array-of-pointer trick to prevent gcc from increasing alignment of the version attribute entries. And with the pointer indirection in place there's no need to increase the alignment of the type. Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 7ccdf87f376f..293250958512 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -66,7 +66,7 @@ struct module_version_attribute { struct module_attribute mattr; const char *module_name; const char *version; -} __attribute__ ((__aligned__(sizeof(void *)))); +}; extern ssize_t __modver_version_show(struct module_attribute *, struct module_kobject *, char *); -- cgit v1.2.3 From b112082c8930e7aa72422484b2d31d3aa06f58bc Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:15 +0100 Subject: module: simplify version-attribute handling Instead of using the array-of-pointers trick to avoid having gcc mess up the built-in module-version array stride, specify type alignment when declaring entries to prevent gcc from increasing alignment. This is essentially an alternative (one-line) fix to the problem addressed by commit b4bc842802db ("module: deal with alignment issues in built-in module versions"). gcc can increase the alignment of larger objects with static extent as an optimisation, but this can be suppressed by using the aligned attribute when declaring variables. Note that we have been relying on this behaviour for kernel parameters for 16 years and it indeed hasn't changed since the introduction of the aligned attribute in gcc-3.1. 
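The pattern the commit relies on can be shown in a few lines of freestanding C. This is only a sketch, not kernel code: the entry type, the "demo" section name and the DEMO_ENTRY() macro are invented for illustration, and it assumes a GNU toolchain (gcc plus GNU ld, which provides __start_<section>/__stop_<section> symbols for any section whose name is a valid C identifier):

	#include <stdio.h>

	struct entry {
		const char *name;
		int tag;
	};

	/*
	 * Pin every object to the type's own alignment so the section is
	 * packed with a stride of exactly sizeof(struct entry); without the
	 * attribute, gcc may over-align individual objects as an optimisation.
	 */
	#define DEMO_ENTRY(n, t)					\
		static const struct entry entry_##n			\
		__attribute__((used, section("demo"),			\
			       aligned(__alignof__(struct entry)))) =	\
		{ #n, t }

	DEMO_ENTRY(foo, 1);
	DEMO_ENTRY(bar, 2);

	/* section bounds provided by the linker */
	extern const struct entry __start_demo[], __stop_demo[];

	int main(void)
	{
		const struct entry *e;

		/* walk the section as a plain array, like version_sysfs_builtin() */
		for (e = __start_demo; e < __stop_demo; e++)
			printf("%s=%d\n", e->name, e->tag);
		return 0;
	}

If one entry were emitted with a larger alignment, the pointer increment would step into padding and the walk would read garbage; the explicit aligned attribute is what rules that out.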
Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/module.h | 26 +++++++++++++------------- kernel/params.c | 10 ++++------ 2 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 293250958512..5958075ea3f4 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -266,20 +266,20 @@ extern typeof(name) __mod_##type##__##name##_device_table \ #else #define MODULE_VERSION(_version) \ MODULE_INFO(version, _version); \ - static struct module_version_attribute ___modver_attr = { \ - .mattr = { \ - .attr = { \ - .name = "version", \ - .mode = S_IRUGO, \ + static struct module_version_attribute __modver_attr \ + __used __section("__modver") \ + __aligned(__alignof__(struct module_version_attribute)) \ + = { \ + .mattr = { \ + .attr = { \ + .name = "version", \ + .mode = S_IRUGO, \ + }, \ + .show = __modver_version_show, \ }, \ - .show = __modver_version_show, \ - }, \ - .module_name = KBUILD_MODNAME, \ - .version = _version, \ - }; \ - static const struct module_version_attribute \ - __used __section("__modver") \ - * __moduleparam_const __modver_attr = &___modver_attr + .module_name = KBUILD_MODNAME, \ + .version = _version, \ + }; #endif /* Optional firmware file (or files) needed by the module diff --git a/kernel/params.c b/kernel/params.c index 3835fb82c64b..aa7d6f2213f1 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -843,18 +843,16 @@ ssize_t __modver_version_show(struct module_attribute *mattr, return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); } -extern const struct module_version_attribute *__start___modver[]; -extern const struct module_version_attribute *__stop___modver[]; +extern const struct module_version_attribute __start___modver[]; +extern const struct module_version_attribute __stop___modver[]; static void __init version_sysfs_builtin(void) { - const struct module_version_attribute **p; + const struct module_version_attribute *vattr; struct module_kobject *mk; int err; - for (p = __start___modver; p < __stop___modver; p++) { - const struct module_version_attribute *vattr = *p; - + for (vattr = __start___modver; vattr < __stop___modver; vattr++) { mk = locate_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); -- cgit v1.2.3 From 8d6615f1fccc4f39d7d3dcf286b33e8a1e833d2b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:17 +0100 Subject: params: drop redundant "unused" attributes Drop the redundant "unused" attributes from module-parameter structures already marked "used". 
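A toy translation unit makes the redundancy visible; the names here are invented and this is only a sketch of the attribute semantics, not kernel code. "used" forces the definition to be emitted and counts as a use (so no -Wunused warning), while "unused" merely silences the warning:

	/* build with: gcc -Wall -Wextra -c attr-demo.c */

	/* emitted and warning-free: adding "unused" here would change nothing */
	static const char kept[] __attribute__((used)) = "kept";

	/* warning silenced only; the compiler may still discard the object */
	static const char quiet[] __attribute__((unused)) = "quiet";

	/* no attribute: gcc can warn "'noisy' defined but not used" */
	static const char noisy[] = "noisy";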
Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/moduleparam.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 6388eb9734a5..742074ad9f6e 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -22,7 +22,7 @@ #define __MODULE_INFO(tag, name, info) \ static const char __UNIQUE_ID(name)[] \ - __used __section(".modinfo") __attribute__((unused, aligned(1))) \ + __used __section(".modinfo") __attribute__((aligned(1))) \ = __MODULE_INFO_PREFIX __stringify(tag) "=" info #define __MODULE_PARM_TYPE(name, _type) \ @@ -289,7 +289,7 @@ struct kparam_array static const char __param_str_##name[] = prefix #name; \ static struct kernel_param __moduleparam_const __param_##name \ __used \ - __section("__param") __attribute__ ((unused, aligned(sizeof(void *)))) \ + __section("__param") __attribute__ ((aligned(sizeof(void *)))) \ = { __param_str_##name, THIS_MODULE, ops, \ VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } } -- cgit v1.2.3 From fe2f4fe139b321a38daafc715aeb7d21d9e8e5ad Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:18 +0100 Subject: params: use type alignment for kernel parameters Specify type alignment for kernel parameters instead of sizeof(void *). The alignment attribute is used to prevent gcc from increasing the alignment of objects with static extent as an optimisation, something which would mess up the __param array stride. Using __alignof__(struct kernel_param) rather than sizeof(void *) is preferred since it better indicates why it is there and doesn't break should the type size or alignment change. Note that on m68k the alignment of struct kernel_param is actually two and that adding a 1- or 2-byte field to the 20-byte struct would cause a breakage with the current 4-byte alignment. Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/moduleparam.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 742074ad9f6e..15ecc6cc3a3b 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -288,8 +288,8 @@ struct kparam_array /* Default value instead of permissions? */ \ static const char __param_str_##name[] = prefix #name; \ static struct kernel_param __moduleparam_const __param_##name \ - __used \ - __section("__param") __attribute__ ((aligned(sizeof(void *)))) \ + __used __section("__param") \ + __aligned(__alignof__(struct kernel_param)) \ = { __param_str_##name, THIS_MODULE, ops, \ VERIFY_OCTAL_PERMISSIONS(perm), level, flags, { arg } } -- cgit v1.2.3 From 2aec389e19150ed3bf67ab708f2435563f76050f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:19 +0100 Subject: params: clean up module-param macros Clean up the module-param macros by adding some indentation and using the __aligned() macro to improve readability. 
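For context, __aligned() is simply the kernel's shorthand for the raw attribute (defined in include/linux/compiler_attributes.h); a rough before/after sketch, assuming struct kernel_param is in scope:

	#define __aligned(x)	__attribute__((__aligned__(x)))

	/* open-coded form: */
	static struct kernel_param p1 __attribute__((aligned(sizeof(void *))));
	/* after the cleanup: */
	static struct kernel_param p2 __aligned(__alignof__(struct kernel_param));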
Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/moduleparam.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 15ecc6cc3a3b..eed280fae433 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -21,12 +21,12 @@ #define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) #define __MODULE_INFO(tag, name, info) \ -static const char __UNIQUE_ID(name)[] \ - __used __section(".modinfo") __attribute__((aligned(1))) \ - = __MODULE_INFO_PREFIX __stringify(tag) "=" info + static const char __UNIQUE_ID(name)[] \ + __used __section(".modinfo") __aligned(1) \ + = __MODULE_INFO_PREFIX __stringify(tag) "=" info #define __MODULE_PARM_TYPE(name, _type) \ - __MODULE_INFO(parmtype, name##type, #name ":" _type) + __MODULE_INFO(parmtype, name##type, #name ":" _type) /* One for each parameter, describing how to use it. Some files do multiple of these per line, so can't just use MODULE_INFO. */ -- cgit v1.2.3 From 640586f8af356096e084d69a9909d217852bde48 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 19 Nov 2020 17:02:21 +0100 Subject: powerpc/ptrace: Simplify gpr_get()/tm_cgpr_get() gpr_get() does membuf_write() twice to override pt_regs->msr in between. We can call membuf_write() once and change ->msr in the kernel buffer, this simplifies the code and the next fix. The patch adds a new simple helper, membuf_at(offs), it returns the new membuf which can be safely used after membuf_write(). Signed-off-by: Oleg Nesterov [mpe: Fixup some minor whitespace issues noticed by Christophe] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201119160221.GA5188@redhat.com --- arch/powerpc/kernel/ptrace/ptrace-tm.c | 12 ++++-------- arch/powerpc/kernel/ptrace/ptrace-view.c | 10 +++------- include/linux/regset.h | 12 ++++++++++++ 3 files changed, 19 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/ptrace/ptrace-tm.c b/arch/powerpc/kernel/ptrace/ptrace-tm.c index 54f2d076206f..f15cbbab45b7 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-tm.c +++ b/arch/powerpc/kernel/ptrace/ptrace-tm.c @@ -86,6 +86,8 @@ int tm_cgpr_active(struct task_struct *target, const struct user_regset *regset) int tm_cgpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { + struct membuf to_msr = membuf_at(&to, offsetof(struct pt_regs, msr)); + if (!cpu_has_feature(CPU_FTR_TM)) return -ENODEV; @@ -96,16 +98,10 @@ int tm_cgpr_get(struct task_struct *target, const struct user_regset *regset, flush_fp_to_thread(target); flush_altivec_to_thread(target); - membuf_write(&to, &target->thread.ckpt_regs, - offsetof(struct pt_regs, msr)); - membuf_store(&to, get_user_ckpt_msr(target)); + membuf_write(&to, &target->thread.ckpt_regs, sizeof(struct user_pt_regs)); - BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != - offsetof(struct pt_regs, msr) + sizeof(long)); + membuf_store(&to_msr, get_user_ckpt_msr(target)); - membuf_write(&to, &target->thread.ckpt_regs.orig_gpr3, - sizeof(struct user_pt_regs) - - offsetof(struct pt_regs, orig_gpr3)); return membuf_zero(&to, ELF_NGREG * sizeof(unsigned long) - sizeof(struct user_pt_regs)); } diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 7e6478e7ed07..299e0b6d709d 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ 
b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -217,6 +217,7 @@ int ptrace_put_reg(struct task_struct *task, int regno, unsigned long data) static int gpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { + struct membuf to_msr = membuf_at(&to, offsetof(struct pt_regs, msr)); int i; if (target->thread.regs == NULL) @@ -228,15 +229,10 @@ static int gpr_get(struct task_struct *target, const struct user_regset *regset, target->thread.regs->gpr[i] = NV_REG_POISON; } - membuf_write(&to, target->thread.regs, offsetof(struct pt_regs, msr)); - membuf_store(&to, get_user_msr(target)); + membuf_write(&to, target->thread.regs, sizeof(struct user_pt_regs)); - BUILD_BUG_ON(offsetof(struct pt_regs, orig_gpr3) != - offsetof(struct pt_regs, msr) + sizeof(long)); + membuf_store(&to_msr, get_user_msr(target)); - membuf_write(&to, &target->thread.regs->orig_gpr3, - sizeof(struct user_pt_regs) - - offsetof(struct pt_regs, orig_gpr3)); return membuf_zero(&to, ELF_NGREG * sizeof(unsigned long) - sizeof(struct user_pt_regs)); } diff --git a/include/linux/regset.h b/include/linux/regset.h index c3403f328257..a00765f0e8cf 100644 --- a/include/linux/regset.h +++ b/include/linux/regset.h @@ -46,6 +46,18 @@ static inline int membuf_write(struct membuf *s, const void *v, size_t size) return s->left; } +static inline struct membuf membuf_at(const struct membuf *s, size_t offs) +{ + struct membuf n = *s; + + if (offs > n.left) + offs = n.left; + n.p += offs; + n.left -= offs; + + return n; +} + /* current s->p must be aligned for v; v must be a scalar */ #define membuf_store(s, v) \ ({ \ -- cgit v1.2.3 From 8d8d53cf8fd028310b1189165b939cde124895d7 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 29 Oct 2020 12:52:40 +1100 Subject: dma-mapping: Allow mixing bypass and mapped DMA operation At the moment we allow bypassing DMA ops only when we can do this for the entire RAM. However there are configs with mixed type memory where we could still allow bypassing IOMMU in most cases; POWERPC with persistent memory is one example. This adds an arch hook to determine where bypass can still work and we invoke direct DMA API. The following patch checks the bus limit on POWERPC to allow or disallow direct mapping. This adds a ARCH_HAS_DMA_MAP_DIRECT config option to make the arch_xxxx hooks no-op by default. 
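To illustrate the shape of such an arch hook (the real POWERPC implementation arrives in the follow-up patch; the body below is a made-up sketch, not that code), an architecture could bypass the IOMMU whenever the mapping stays below the device's direct-mapped bus limit:

	/* sketch: built only when ARCH_HAS_DMA_MAP_DIRECT is selected */
	bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr)
	{
		/* the core passes the end address of the mapping */
		return dev->bus_dma_limit && addr <= dev->bus_dma_limit;
	}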
Signed-off-by: Alexey Kardashevskiy Signed-off-by: Christoph Hellwig --- include/linux/dma-map-ops.h | 14 ++++++++++++++ kernel/dma/Kconfig | 4 ++++ kernel/dma/mapping.c | 12 ++++++++---- 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index a5f89fc4d6df..38c8a4558e08 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -314,6 +314,20 @@ static inline void arch_dma_mark_clean(phys_addr_t paddr, size_t size) void *arch_dma_set_uncached(void *addr, size_t size); void arch_dma_clear_uncached(void *addr, size_t size); +#ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT +bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr); +bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle); +bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg, + int nents); +bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, + int nents); +#else +#define arch_dma_map_page_direct(d, a) (false) +#define arch_dma_unmap_page_direct(d, a) (false) +#define arch_dma_map_sg_direct(d, s, n) (false) +#define arch_dma_unmap_sg_direct(d, s, n) (false) +#endif + #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index c99de4a21458..43d106598e82 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -20,6 +20,10 @@ config DMA_OPS config DMA_OPS_BYPASS bool +# Lets platform IOMMU driver choose between bypass and IOMMU +config ARCH_HAS_DMA_MAP_DIRECT + bool + config NEED_SG_DMA_LENGTH bool diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 51bb8fa8eb89..f87a89d08654 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -149,7 +149,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, if (WARN_ON_ONCE(!dev->dma_mask)) return DMA_MAPPING_ERROR; - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); @@ -165,7 +166,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_unmap_page_direct(dev, addr + size)) dma_direct_unmap_page(dev, addr, size, dir, attrs); else if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); @@ -188,7 +190,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, if (WARN_ON_ONCE(!dev->dma_mask)) return 0; - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_map_sg_direct(dev, sg, nents)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else ents = ops->map_sg(dev, sg, nents, dir, attrs); @@ -207,7 +210,8 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); debug_dma_unmap_sg(dev, sg, nents, dir); - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_unmap_sg_direct(dev, sg, nents)) dma_direct_unmap_sg(dev, sg, nents, dir, attrs); else if (ops->unmap_sg) ops->unmap_sg(dev, sg, nents, dir, attrs); -- cgit v1.2.3 From 928296ea5da37838d7127de4b10f47cd97401b13 Mon Sep 17 00:00:00 2001 From: Matthias Brugger 
Date: Fri, 30 Oct 2020 12:36:11 +0100 Subject: soc: mediatek: pm_domains: Make bus protection generic Bus protection is not exclusively done by calling the infracfg misc driver. Make the calls for setting and clearing the bus protection generic so that we can use other blocks for it as well. Signed-off-by: Matthias Brugger Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20201030113622.201188-6-enric.balletbo@collabora.com Signed-off-by: Matthias Brugger --- drivers/soc/mediatek/Kconfig | 1 - drivers/soc/mediatek/mtk-infracfg.c | 5 --- drivers/soc/mediatek/mtk-pm-domains.c | 57 +++++++++++++++++++++++++++-------- include/linux/soc/mediatek/infracfg.h | 5 +++ 4 files changed, 49 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/mediatek/Kconfig b/drivers/soc/mediatek/Kconfig index 68d800f9e4a5..67cef12dc585 100644 --- a/drivers/soc/mediatek/Kconfig +++ b/drivers/soc/mediatek/Kconfig @@ -48,7 +48,6 @@ config MTK_SCPSYS_PM_DOMAINS bool "MediaTek SCPSYS generic power domain" default ARCH_MEDIATEK depends on PM - depends on MTK_INFRACFG select PM_GENERIC_DOMAINS select REGMAP help diff --git a/drivers/soc/mediatek/mtk-infracfg.c b/drivers/soc/mediatek/mtk-infracfg.c index 4a123796aad3..0590b68e0d78 100644 --- a/drivers/soc/mediatek/mtk-infracfg.c +++ b/drivers/soc/mediatek/mtk-infracfg.c @@ -12,11 +12,6 @@ #define MTK_POLL_DELAY_US 10 #define MTK_POLL_TIMEOUT (jiffies_to_usecs(HZ)) -#define INFRA_TOPAXI_PROTECTEN 0x0220 -#define INFRA_TOPAXI_PROTECTSTA1 0x0228 -#define INFRA_TOPAXI_PROTECTEN_SET 0x0260 -#define INFRA_TOPAXI_PROTECTEN_CLR 0x0264 - /** * mtk_infracfg_set_bus_protection - enable bus protection * @infracfg: The infracfg regmap diff --git a/drivers/soc/mediatek/mtk-pm-domains.c b/drivers/soc/mediatek/mtk-pm-domains.c index 06a16e45356a..6122701d018f 100644 --- a/drivers/soc/mediatek/mtk-pm-domains.c +++ b/drivers/soc/mediatek/mtk-pm-domains.c @@ -86,18 +86,24 @@ static int scpsys_sram_disable(struct scpsys_domain *pd) MTK_POLL_TIMEOUT); } -static int scpsys_bus_protect_enable(struct scpsys_domain *pd) +static int _scpsys_bus_protect_enable(const struct scpsys_bus_prot_data *bpd, struct regmap *regmap) { - const struct scpsys_bus_prot_data *bpd = pd->data->bp_infracfg; int i, ret; for (i = 0; i < SPM_MAX_BUS_PROT_DATA; i++) { - if (!bpd[i].bus_prot_mask) + u32 val, mask = bpd[i].bus_prot_mask; + + if (!mask) break; - ret = mtk_infracfg_set_bus_protection(pd->infracfg, - bpd[i].bus_prot_mask, - bpd[i].bus_prot_reg_update); + if (bpd[i].bus_prot_reg_update) + regmap_set_bits(regmap, bpd[i].bus_prot_set, mask); + else + regmap_write(regmap, INFRA_TOPAXI_PROTECTEN_SET, mask); + + ret = regmap_read_poll_timeout(regmap, INFRA_TOPAXI_PROTECTSTA1, + val, (val & mask) == mask, + MTK_POLL_DELAY_US, MTK_POLL_TIMEOUT); if (ret) return ret; } @@ -105,18 +111,34 @@ static int scpsys_bus_protect_enable(struct scpsys_domain *pd) return 0; } -static int scpsys_bus_protect_disable(struct scpsys_domain *pd) +static int scpsys_bus_protect_enable(struct scpsys_domain *pd) +{ + int ret; + + ret = _scpsys_bus_protect_enable(pd->data->bp_infracfg, pd->infracfg); + + return ret; +} + +static int _scpsys_bus_protect_disable(const struct scpsys_bus_prot_data *bpd, + struct regmap *regmap) { - const struct scpsys_bus_prot_data *bpd = pd->data->bp_infracfg; int i, ret; - for (i = SPM_MAX_BUS_PROT_DATA; i > 0; i--) { - if (!bpd[i].bus_prot_mask) + for (i = SPM_MAX_BUS_PROT_DATA - 1; i >= 0; i--) { + u32 val, mask = bpd[i].bus_prot_mask; + + if (!mask) 
continue; - ret = mtk_infracfg_clear_bus_protection(pd->infracfg, - bpd[i].bus_prot_mask, - bpd[i].bus_prot_reg_update); + if (bpd[i].bus_prot_reg_update) + regmap_clear_bits(regmap, bpd[i].bus_prot_clr, mask); + else + regmap_write(regmap, INFRA_TOPAXI_PROTECTEN_CLR, mask); + + ret = regmap_read_poll_timeout(regmap, INFRA_TOPAXI_PROTECTSTA1, + val, !(val & mask), + MTK_POLL_DELAY_US, MTK_POLL_TIMEOUT); if (ret) return ret; } @@ -124,6 +146,15 @@ static int scpsys_bus_protect_disable(struct scpsys_domain *pd) return 0; } +static int scpsys_bus_protect_disable(struct scpsys_domain *pd) +{ + int ret; + + ret = _scpsys_bus_protect_disable(pd->data->bp_infracfg, pd->infracfg); + + return ret; +} + static int scpsys_power_on(struct generic_pm_domain *genpd) { struct scpsys_domain *pd = container_of(genpd, struct scpsys_domain, genpd); diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h index 233463d789c6..5bcaab767f6a 100644 --- a/include/linux/soc/mediatek/infracfg.h +++ b/include/linux/soc/mediatek/infracfg.h @@ -32,6 +32,11 @@ #define MT7622_TOP_AXI_PROT_EN_WB (BIT(2) | BIT(6) | \ BIT(7) | BIT(8)) +#define INFRA_TOPAXI_PROTECTEN 0x0220 +#define INFRA_TOPAXI_PROTECTSTA1 0x0228 +#define INFRA_TOPAXI_PROTECTEN_SET 0x0260 +#define INFRA_TOPAXI_PROTECTEN_CLR 0x0264 + #define REG_INFRA_MISC 0xf00 #define F_DDR_4GB_SUPPORT_EN BIT(13) -- cgit v1.2.3 From eb9fa767fbe19d3db7d303e9fde7f3056221ffe1 Mon Sep 17 00:00:00 2001 From: Matthias Brugger Date: Fri, 30 Oct 2020 12:36:17 +0100 Subject: soc: mediatek: pm-domains: Add support for mt8183 Add the needed board data to support mt8183 SoC. Signed-off-by: Matthias Brugger Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20201030113622.201188-12-enric.balletbo@collabora.com Signed-off-by: Matthias Brugger --- drivers/soc/mediatek/mt8183-pm-domains.h | 221 +++++++++++++++++++++++++++++++ drivers/soc/mediatek/mtk-pm-domains.c | 5 + drivers/soc/mediatek/mtk-pm-domains.h | 1 + include/linux/soc/mediatek/infracfg.h | 46 +++++++ 4 files changed, 273 insertions(+) create mode 100644 drivers/soc/mediatek/mt8183-pm-domains.h (limited to 'include/linux') diff --git a/drivers/soc/mediatek/mt8183-pm-domains.h b/drivers/soc/mediatek/mt8183-pm-domains.h new file mode 100644 index 000000000000..8d996c5d2682 --- /dev/null +++ b/drivers/soc/mediatek/mt8183-pm-domains.h @@ -0,0 +1,221 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __SOC_MEDIATEK_MT8183_PM_DOMAINS_H +#define __SOC_MEDIATEK_MT8183_PM_DOMAINS_H + +#include "mtk-pm-domains.h" +#include + +/* + * MT8183 power domain support + */ + +static const struct scpsys_domain_data scpsys_domain_data_mt8183[] = { + [MT8183_POWER_DOMAIN_AUDIO] = { + .sta_mask = PWR_STATUS_AUDIO, + .ctl_offs = 0x0314, + .sram_pdn_bits = GENMASK(11, 8), + .sram_pdn_ack_bits = GENMASK(15, 12), + }, + [MT8183_POWER_DOMAIN_CONN] = { + .sta_mask = PWR_STATUS_CONN, + .ctl_offs = 0x032c, + .sram_pdn_bits = 0, + .sram_pdn_ack_bits = 0, + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_CONN, MT8183_TOP_AXI_PROT_EN_SET, + MT8183_TOP_AXI_PROT_EN_CLR, MT8183_TOP_AXI_PROT_EN_STA1), + }, + }, + [MT8183_POWER_DOMAIN_MFG_ASYNC] = { + .sta_mask = PWR_STATUS_MFG_ASYNC, + .ctl_offs = 0x0334, + .sram_pdn_bits = 0, + .sram_pdn_ack_bits = 0, + }, + [MT8183_POWER_DOMAIN_MFG] = { + .sta_mask = PWR_STATUS_MFG, + .ctl_offs = 0x0338, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8183_POWER_DOMAIN_MFG_CORE0] = { + .sta_mask = BIT(7), + .ctl_offs = 0x034c, 
+ .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8183_POWER_DOMAIN_MFG_CORE1] = { + .sta_mask = BIT(20), + .ctl_offs = 0x0310, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8183_POWER_DOMAIN_MFG_2D] = { + .sta_mask = PWR_STATUS_MFG_2D, + .ctl_offs = 0x0348, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_1_MFG, MT8183_TOP_AXI_PROT_EN_1_SET, + MT8183_TOP_AXI_PROT_EN_1_CLR, MT8183_TOP_AXI_PROT_EN_STA1_1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MFG, MT8183_TOP_AXI_PROT_EN_SET, + MT8183_TOP_AXI_PROT_EN_CLR, MT8183_TOP_AXI_PROT_EN_STA1), + }, + }, + [MT8183_POWER_DOMAIN_DISP] = { + .sta_mask = PWR_STATUS_DISP, + .ctl_offs = 0x030c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_1_DISP, MT8183_TOP_AXI_PROT_EN_1_SET, + MT8183_TOP_AXI_PROT_EN_1_CLR, MT8183_TOP_AXI_PROT_EN_STA1_1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_DISP, MT8183_TOP_AXI_PROT_EN_SET, + MT8183_TOP_AXI_PROT_EN_CLR, MT8183_TOP_AXI_PROT_EN_STA1), + }, + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_DISP, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_CAM] = { + .sta_mask = BIT(25), + .ctl_offs = 0x0344, + .sram_pdn_bits = GENMASK(9, 8), + .sram_pdn_ack_bits = GENMASK(13, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MM_CAM, MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, MT8183_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_CAM, MT8183_TOP_AXI_PROT_EN_SET, + MT8183_TOP_AXI_PROT_EN_CLR, MT8183_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR_IGN(MT8183_TOP_AXI_PROT_EN_MM_CAM_2ND, + MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, + MT8183_TOP_AXI_PROT_EN_MM_STA1), + }, + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_CAM, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_ISP] = { + .sta_mask = PWR_STATUS_ISP, + .ctl_offs = 0x0308, + .sram_pdn_bits = GENMASK(9, 8), + .sram_pdn_ack_bits = GENMASK(13, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MM_ISP, + MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, + MT8183_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR_IGN(MT8183_TOP_AXI_PROT_EN_MM_ISP_2ND, + MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, + MT8183_TOP_AXI_PROT_EN_MM_STA1), + }, + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_ISP, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_VDEC] = { + .sta_mask = BIT(31), + .ctl_offs = 0x0300, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_VDEC, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_VENC] = { + .sta_mask = PWR_STATUS_VENC, + .ctl_offs = 0x0304, + .sram_pdn_bits = GENMASK(11, 8), + .sram_pdn_ack_bits = GENMASK(15, 12), + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_VENC, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_VPU_TOP] = { + .sta_mask = BIT(26), + .ctl_offs = 0x0324, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 
12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MM_VPU_TOP, + MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, + MT8183_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_VPU_TOP, + MT8183_TOP_AXI_PROT_EN_SET, + MT8183_TOP_AXI_PROT_EN_CLR, + MT8183_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MM_VPU_TOP_2ND, + MT8183_TOP_AXI_PROT_EN_MM_SET, + MT8183_TOP_AXI_PROT_EN_MM_CLR, + MT8183_TOP_AXI_PROT_EN_MM_STA1), + }, + .bp_smi = { + BUS_PROT_WR(MT8183_SMI_COMMON_SMI_CLAMP_VPU_TOP, + MT8183_SMI_COMMON_CLAMP_EN_SET, + MT8183_SMI_COMMON_CLAMP_EN_CLR, + MT8183_SMI_COMMON_CLAMP_EN), + }, + }, + [MT8183_POWER_DOMAIN_VPU_CORE0] = { + .sta_mask = BIT(27), + .ctl_offs = 0x33c, + .sram_pdn_bits = GENMASK(11, 8), + .sram_pdn_ack_bits = GENMASK(13, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE0, + MT8183_TOP_AXI_PROT_EN_MCU_SET, + MT8183_TOP_AXI_PROT_EN_MCU_CLR, + MT8183_TOP_AXI_PROT_EN_MCU_STA1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE0_2ND, + MT8183_TOP_AXI_PROT_EN_MCU_SET, + MT8183_TOP_AXI_PROT_EN_MCU_CLR, + MT8183_TOP_AXI_PROT_EN_MCU_STA1), + }, + .caps = MTK_SCPD_SRAM_ISO, + }, + [MT8183_POWER_DOMAIN_VPU_CORE1] = { + .sta_mask = BIT(28), + .ctl_offs = 0x0340, + .sram_pdn_bits = GENMASK(11, 8), + .sram_pdn_ack_bits = GENMASK(13, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE1, + MT8183_TOP_AXI_PROT_EN_MCU_SET, + MT8183_TOP_AXI_PROT_EN_MCU_CLR, + MT8183_TOP_AXI_PROT_EN_MCU_STA1), + BUS_PROT_WR(MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE1_2ND, + MT8183_TOP_AXI_PROT_EN_MCU_SET, + MT8183_TOP_AXI_PROT_EN_MCU_CLR, + MT8183_TOP_AXI_PROT_EN_MCU_STA1), + }, + .caps = MTK_SCPD_SRAM_ISO, + }, +}; + +static const struct scpsys_soc_data mt8183_scpsys_data = { + .domains_data = scpsys_domain_data_mt8183, + .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8183), + .pwr_sta_offs = 0x0180, + .pwr_sta2nd_offs = 0x0184 +}; + +#endif /* __SOC_MEDIATEK_MT8183_PM_DOMAINS_H */ diff --git a/drivers/soc/mediatek/mtk-pm-domains.c b/drivers/soc/mediatek/mtk-pm-domains.c index 03279a999dfc..8703d50cd2b7 100644 --- a/drivers/soc/mediatek/mtk-pm-domains.c +++ b/drivers/soc/mediatek/mtk-pm-domains.c @@ -16,6 +16,7 @@ #include #include "mt8173-pm-domains.h" +#include "mt8183-pm-domains.h" #define MTK_POLL_DELAY_US 10 #define MTK_POLL_TIMEOUT USEC_PER_SEC @@ -505,6 +506,10 @@ static const struct of_device_id scpsys_of_match[] = { .compatible = "mediatek,mt8173-power-controller", .data = &mt8173_scpsys_data, }, + { + .compatible = "mediatek,mt8183-power-controller", + .data = &mt8183_scpsys_data, + }, { } }; diff --git a/drivers/soc/mediatek/mtk-pm-domains.h b/drivers/soc/mediatek/mtk-pm-domains.h index 809d2d43f01d..2c745f11b422 100644 --- a/drivers/soc/mediatek/mtk-pm-domains.h +++ b/drivers/soc/mediatek/mtk-pm-domains.h @@ -22,6 +22,7 @@ #define SPM_PWR_STATUS 0x060c #define SPM_PWR_STATUS_2ND 0x0610 +#define PWR_STATUS_CONN BIT(1) #define PWR_STATUS_DISP BIT(3) #define PWR_STATUS_MFG BIT(4) #define PWR_STATUS_ISP BIT(5) diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h index 5bcaab767f6a..9d01e32e19bc 100644 --- a/include/linux/soc/mediatek/infracfg.h +++ b/include/linux/soc/mediatek/infracfg.h @@ -2,6 +2,52 @@ #ifndef __SOC_MEDIATEK_INFRACFG_H #define __SOC_MEDIATEK_INFRACFG_H +#define MT8183_TOP_AXI_PROT_EN_STA1 0x228 +#define MT8183_TOP_AXI_PROT_EN_STA1_1 0x258 +#define MT8183_TOP_AXI_PROT_EN_SET 0x2a0 +#define MT8183_TOP_AXI_PROT_EN_CLR 0x2a4 +#define MT8183_TOP_AXI_PROT_EN_1_SET 
0x2a8 +#define MT8183_TOP_AXI_PROT_EN_1_CLR 0x2ac +#define MT8183_TOP_AXI_PROT_EN_MCU_SET 0x2c4 +#define MT8183_TOP_AXI_PROT_EN_MCU_CLR 0x2c8 +#define MT8183_TOP_AXI_PROT_EN_MCU_STA1 0x2e4 +#define MT8183_TOP_AXI_PROT_EN_MM_SET 0x2d4 +#define MT8183_TOP_AXI_PROT_EN_MM_CLR 0x2d8 +#define MT8183_TOP_AXI_PROT_EN_MM_STA1 0x2ec + +#define MT8183_TOP_AXI_PROT_EN_DISP (BIT(10) | BIT(11)) +#define MT8183_TOP_AXI_PROT_EN_CONN (BIT(13) | BIT(14)) +#define MT8183_TOP_AXI_PROT_EN_MFG (BIT(21) | BIT(22)) +#define MT8183_TOP_AXI_PROT_EN_CAM BIT(28) +#define MT8183_TOP_AXI_PROT_EN_VPU_TOP BIT(27) +#define MT8183_TOP_AXI_PROT_EN_1_DISP (BIT(16) | BIT(17)) +#define MT8183_TOP_AXI_PROT_EN_1_MFG GENMASK(21, 19) +#define MT8183_TOP_AXI_PROT_EN_MM_ISP (BIT(3) | BIT(8)) +#define MT8183_TOP_AXI_PROT_EN_MM_ISP_2ND BIT(10) +#define MT8183_TOP_AXI_PROT_EN_MM_CAM (BIT(4) | BIT(5) | \ + BIT(9) | BIT(13)) +#define MT8183_TOP_AXI_PROT_EN_MM_VPU_TOP (GENMASK(9, 6) | \ + BIT(12)) +#define MT8183_TOP_AXI_PROT_EN_MM_VPU_TOP_2ND (BIT(10) | BIT(11)) +#define MT8183_TOP_AXI_PROT_EN_MM_CAM_2ND BIT(11) +#define MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE0_2ND (BIT(0) | BIT(2) | \ + BIT(4)) +#define MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE1_2ND (BIT(1) | BIT(3) | \ + BIT(5)) +#define MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE0 BIT(6) +#define MT8183_TOP_AXI_PROT_EN_MCU_VPU_CORE1 BIT(7) + +#define MT8183_SMI_COMMON_CLAMP_EN 0x3c0 +#define MT8183_SMI_COMMON_CLAMP_EN_SET 0x3c4 +#define MT8183_SMI_COMMON_CLAMP_EN_CLR 0x3c8 + +#define MT8183_SMI_COMMON_SMI_CLAMP_DISP GENMASK(7, 0) +#define MT8183_SMI_COMMON_SMI_CLAMP_VENC BIT(1) +#define MT8183_SMI_COMMON_SMI_CLAMP_ISP BIT(2) +#define MT8183_SMI_COMMON_SMI_CLAMP_CAM (BIT(3) | BIT(4)) +#define MT8183_SMI_COMMON_SMI_CLAMP_VPU_TOP (BIT(5) | BIT(6)) +#define MT8183_SMI_COMMON_SMI_CLAMP_VDEC BIT(7) + #define MT8173_TOP_AXI_PROT_EN_MCI_M2 BIT(0) #define MT8173_TOP_AXI_PROT_EN_MM_M0 BIT(1) #define MT8173_TOP_AXI_PROT_EN_MM_M1 BIT(2) -- cgit v1.2.3 From a49d5e7a89d644a5c0ddc851be4bbf08614e6015 Mon Sep 17 00:00:00 2001 From: Weiyi Lu Date: Fri, 30 Oct 2020 12:36:22 +0100 Subject: soc: mediatek: pm-domains: Add support for mt8192 Add the needed board data to support mt8192 SoC. 
Signed-off-by: Weiyi Lu Signed-off-by: Enric Balletbo i Serra Tested-by: Weiyi Lu Link: https://lore.kernel.org/r/20201030113622.201188-17-enric.balletbo@collabora.com Signed-off-by: Matthias Brugger --- drivers/soc/mediatek/mt8192-pm-domains.h | 292 +++++++++++++++++++++++++++++++ drivers/soc/mediatek/mtk-pm-domains.c | 5 + include/linux/soc/mediatek/infracfg.h | 56 ++++++ 3 files changed, 353 insertions(+) create mode 100644 drivers/soc/mediatek/mt8192-pm-domains.h (limited to 'include/linux') diff --git a/drivers/soc/mediatek/mt8192-pm-domains.h b/drivers/soc/mediatek/mt8192-pm-domains.h new file mode 100644 index 000000000000..0fdf6dc6231f --- /dev/null +++ b/drivers/soc/mediatek/mt8192-pm-domains.h @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __SOC_MEDIATEK_MT8192_PM_DOMAINS_H +#define __SOC_MEDIATEK_MT8192_PM_DOMAINS_H + +#include "mtk-pm-domains.h" +#include + +/* + * MT8192 power domain support + */ + +static const struct scpsys_domain_data scpsys_domain_data_mt8192[] = { + [MT8192_POWER_DOMAIN_AUDIO] = { + .sta_mask = BIT(21), + .ctl_offs = 0x0354, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_2_AUDIO, + MT8192_TOP_AXI_PROT_EN_2_SET, + MT8192_TOP_AXI_PROT_EN_2_CLR, + MT8192_TOP_AXI_PROT_EN_2_STA1), + }, + }, + [MT8192_POWER_DOMAIN_CONN] = { + .sta_mask = PWR_STATUS_CONN, + .ctl_offs = 0x0304, + .sram_pdn_bits = 0, + .sram_pdn_ack_bits = 0, + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_CONN, + MT8192_TOP_AXI_PROT_EN_SET, + MT8192_TOP_AXI_PROT_EN_CLR, + MT8192_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_CONN_2ND, + MT8192_TOP_AXI_PROT_EN_SET, + MT8192_TOP_AXI_PROT_EN_CLR, + MT8192_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_1_CONN, + MT8192_TOP_AXI_PROT_EN_1_SET, + MT8192_TOP_AXI_PROT_EN_1_CLR, + MT8192_TOP_AXI_PROT_EN_1_STA1), + }, + .caps = MTK_SCPD_KEEP_DEFAULT_OFF, + }, + [MT8192_POWER_DOMAIN_MFG0] = { + .sta_mask = BIT(2), + .ctl_offs = 0x0308, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_MFG1] = { + .sta_mask = BIT(3), + .ctl_offs = 0x030c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_1_MFG1, + MT8192_TOP_AXI_PROT_EN_1_SET, + MT8192_TOP_AXI_PROT_EN_1_CLR, + MT8192_TOP_AXI_PROT_EN_1_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_2_MFG1, + MT8192_TOP_AXI_PROT_EN_2_SET, + MT8192_TOP_AXI_PROT_EN_2_CLR, + MT8192_TOP_AXI_PROT_EN_2_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MFG1, + MT8192_TOP_AXI_PROT_EN_SET, + MT8192_TOP_AXI_PROT_EN_CLR, + MT8192_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_2_MFG1_2ND, + MT8192_TOP_AXI_PROT_EN_2_SET, + MT8192_TOP_AXI_PROT_EN_2_CLR, + MT8192_TOP_AXI_PROT_EN_2_STA1), + }, + }, + [MT8192_POWER_DOMAIN_MFG2] = { + .sta_mask = BIT(4), + .ctl_offs = 0x0310, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_MFG3] = { + .sta_mask = BIT(5), + .ctl_offs = 0x0314, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_MFG4] = { + .sta_mask = BIT(6), + .ctl_offs = 0x0318, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_MFG5] = { + .sta_mask = BIT(7), + .ctl_offs = 0x031c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_MFG6] = { + .sta_mask = BIT(8), + 
.ctl_offs = 0x0320, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_DISP] = { + .sta_mask = BIT(20), + .ctl_offs = 0x0350, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR_IGN(MT8192_TOP_AXI_PROT_EN_MM_DISP, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR_IGN(MT8192_TOP_AXI_PROT_EN_MM_2_DISP, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_DISP, + MT8192_TOP_AXI_PROT_EN_SET, + MT8192_TOP_AXI_PROT_EN_CLR, + MT8192_TOP_AXI_PROT_EN_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_DISP_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_2_DISP_2ND, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + }, + }, + [MT8192_POWER_DOMAIN_IPE] = { + .sta_mask = BIT(14), + .ctl_offs = 0x0338, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_IPE, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_IPE_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + }, + }, + [MT8192_POWER_DOMAIN_ISP] = { + .sta_mask = BIT(12), + .ctl_offs = 0x0330, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_2_ISP, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_2_ISP_2ND, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + }, + }, + [MT8192_POWER_DOMAIN_ISP2] = { + .sta_mask = BIT(13), + .ctl_offs = 0x0334, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_ISP2, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_ISP2_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + }, + }, + [MT8192_POWER_DOMAIN_MDP] = { + .sta_mask = BIT(19), + .ctl_offs = 0x034c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_2_MDP, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_2_MDP_2ND, + MT8192_TOP_AXI_PROT_EN_MM_2_SET, + MT8192_TOP_AXI_PROT_EN_MM_2_CLR, + MT8192_TOP_AXI_PROT_EN_MM_2_STA1), + }, + }, + [MT8192_POWER_DOMAIN_VENC] = { + .sta_mask = BIT(17), + .ctl_offs = 0x0344, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_VENC, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_VENC_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + }, + }, + [MT8192_POWER_DOMAIN_VDEC] = { + .sta_mask = BIT(15), + .ctl_offs = 0x033c, + .sram_pdn_bits = 
GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_VDEC, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_VDEC_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + }, + }, + [MT8192_POWER_DOMAIN_VDEC2] = { + .sta_mask = BIT(16), + .ctl_offs = 0x0340, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_CAM] = { + .sta_mask = BIT(23), + .ctl_offs = 0x035c, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + .bp_infracfg = { + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_2_CAM, + MT8192_TOP_AXI_PROT_EN_2_SET, + MT8192_TOP_AXI_PROT_EN_2_CLR, + MT8192_TOP_AXI_PROT_EN_2_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_CAM, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_1_CAM, + MT8192_TOP_AXI_PROT_EN_1_SET, + MT8192_TOP_AXI_PROT_EN_1_CLR, + MT8192_TOP_AXI_PROT_EN_1_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_MM_CAM_2ND, + MT8192_TOP_AXI_PROT_EN_MM_SET, + MT8192_TOP_AXI_PROT_EN_MM_CLR, + MT8192_TOP_AXI_PROT_EN_MM_STA1), + BUS_PROT_WR(MT8192_TOP_AXI_PROT_EN_VDNR_CAM, + MT8192_TOP_AXI_PROT_EN_VDNR_SET, + MT8192_TOP_AXI_PROT_EN_VDNR_CLR, + MT8192_TOP_AXI_PROT_EN_VDNR_STA1), + }, + }, + [MT8192_POWER_DOMAIN_CAM_RAWA] = { + .sta_mask = BIT(24), + .ctl_offs = 0x0360, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_CAM_RAWB] = { + .sta_mask = BIT(25), + .ctl_offs = 0x0364, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, + [MT8192_POWER_DOMAIN_CAM_RAWC] = { + .sta_mask = BIT(26), + .ctl_offs = 0x0368, + .sram_pdn_bits = GENMASK(8, 8), + .sram_pdn_ack_bits = GENMASK(12, 12), + }, +}; + +static const struct scpsys_soc_data mt8192_scpsys_data = { + .domains_data = scpsys_domain_data_mt8192, + .num_domains = ARRAY_SIZE(scpsys_domain_data_mt8192), + .pwr_sta_offs = 0x016c, + .pwr_sta2nd_offs = 0x0170, +}; + +#endif /* __SOC_MEDIATEK_MT8192_PM_DOMAINS_H */ diff --git a/drivers/soc/mediatek/mtk-pm-domains.c b/drivers/soc/mediatek/mtk-pm-domains.c index c3b85b69f2f7..fb70cb3b07b3 100644 --- a/drivers/soc/mediatek/mtk-pm-domains.c +++ b/drivers/soc/mediatek/mtk-pm-domains.c @@ -17,6 +17,7 @@ #include "mt8173-pm-domains.h" #include "mt8183-pm-domains.h" +#include "mt8192-pm-domains.h" #define MTK_POLL_DELAY_US 10 #define MTK_POLL_TIMEOUT USEC_PER_SEC @@ -521,6 +522,10 @@ static const struct of_device_id scpsys_of_match[] = { .compatible = "mediatek,mt8183-power-controller", .data = &mt8183_scpsys_data, }, + { + .compatible = "mediatek,mt8192-power-controller", + .data = &mt8192_scpsys_data, + }, { } }; diff --git a/include/linux/soc/mediatek/infracfg.h b/include/linux/soc/mediatek/infracfg.h index 9d01e32e19bc..e7842debc05d 100644 --- a/include/linux/soc/mediatek/infracfg.h +++ b/include/linux/soc/mediatek/infracfg.h @@ -2,6 +2,62 @@ #ifndef __SOC_MEDIATEK_INFRACFG_H #define __SOC_MEDIATEK_INFRACFG_H +#define MT8192_TOP_AXI_PROT_EN_STA1 0x228 +#define MT8192_TOP_AXI_PROT_EN_1_STA1 0x258 +#define MT8192_TOP_AXI_PROT_EN_SET 0x2a0 +#define MT8192_TOP_AXI_PROT_EN_CLR 0x2a4 +#define MT8192_TOP_AXI_PROT_EN_1_SET 0x2a8 +#define MT8192_TOP_AXI_PROT_EN_1_CLR 0x2ac +#define MT8192_TOP_AXI_PROT_EN_MM_SET 0x2d4 +#define MT8192_TOP_AXI_PROT_EN_MM_CLR 0x2d8 +#define 
MT8192_TOP_AXI_PROT_EN_MM_STA1 0x2ec +#define MT8192_TOP_AXI_PROT_EN_2_SET 0x714 +#define MT8192_TOP_AXI_PROT_EN_2_CLR 0x718 +#define MT8192_TOP_AXI_PROT_EN_2_STA1 0x724 +#define MT8192_TOP_AXI_PROT_EN_VDNR_SET 0xb84 +#define MT8192_TOP_AXI_PROT_EN_VDNR_CLR 0xb88 +#define MT8192_TOP_AXI_PROT_EN_VDNR_STA1 0xb90 +#define MT8192_TOP_AXI_PROT_EN_MM_2_SET 0xdcc +#define MT8192_TOP_AXI_PROT_EN_MM_2_CLR 0xdd0 +#define MT8192_TOP_AXI_PROT_EN_MM_2_STA1 0xdd8 + +#define MT8192_TOP_AXI_PROT_EN_DISP (BIT(6) | BIT(23)) +#define MT8192_TOP_AXI_PROT_EN_CONN (BIT(13) | BIT(18)) +#define MT8192_TOP_AXI_PROT_EN_CONN_2ND BIT(14) +#define MT8192_TOP_AXI_PROT_EN_MFG1 GENMASK(22, 21) +#define MT8192_TOP_AXI_PROT_EN_1_CONN BIT(10) +#define MT8192_TOP_AXI_PROT_EN_1_MFG1 BIT(21) +#define MT8192_TOP_AXI_PROT_EN_1_CAM BIT(22) +#define MT8192_TOP_AXI_PROT_EN_2_CAM BIT(0) +#define MT8192_TOP_AXI_PROT_EN_2_ADSP BIT(3) +#define MT8192_TOP_AXI_PROT_EN_2_AUDIO BIT(4) +#define MT8192_TOP_AXI_PROT_EN_2_MFG1 GENMASK(6, 5) +#define MT8192_TOP_AXI_PROT_EN_2_MFG1_2ND BIT(7) +#define MT8192_TOP_AXI_PROT_EN_MM_CAM (BIT(0) | BIT(2)) +#define MT8192_TOP_AXI_PROT_EN_MM_DISP (BIT(0) | BIT(2) | \ + BIT(10) | BIT(12) | \ + BIT(14) | BIT(16) | \ + BIT(24) | BIT(26)) +#define MT8192_TOP_AXI_PROT_EN_MM_CAM_2ND (BIT(1) | BIT(3)) +#define MT8192_TOP_AXI_PROT_EN_MM_DISP_2ND (BIT(1) | BIT(3) | \ + BIT(15) | BIT(17) | \ + BIT(25) | BIT(27)) +#define MT8192_TOP_AXI_PROT_EN_MM_ISP2 BIT(14) +#define MT8192_TOP_AXI_PROT_EN_MM_ISP2_2ND BIT(15) +#define MT8192_TOP_AXI_PROT_EN_MM_IPE BIT(16) +#define MT8192_TOP_AXI_PROT_EN_MM_IPE_2ND BIT(17) +#define MT8192_TOP_AXI_PROT_EN_MM_VDEC BIT(24) +#define MT8192_TOP_AXI_PROT_EN_MM_VDEC_2ND BIT(25) +#define MT8192_TOP_AXI_PROT_EN_MM_VENC BIT(26) +#define MT8192_TOP_AXI_PROT_EN_MM_VENC_2ND BIT(27) +#define MT8192_TOP_AXI_PROT_EN_MM_2_ISP BIT(8) +#define MT8192_TOP_AXI_PROT_EN_MM_2_DISP (BIT(8) | BIT(12)) +#define MT8192_TOP_AXI_PROT_EN_MM_2_ISP_2ND BIT(9) +#define MT8192_TOP_AXI_PROT_EN_MM_2_DISP_2ND (BIT(9) | BIT(13)) +#define MT8192_TOP_AXI_PROT_EN_MM_2_MDP BIT(12) +#define MT8192_TOP_AXI_PROT_EN_MM_2_MDP_2ND BIT(13) +#define MT8192_TOP_AXI_PROT_EN_VDNR_CAM BIT(21) + #define MT8183_TOP_AXI_PROT_EN_STA1 0x228 #define MT8183_TOP_AXI_PROT_EN_STA1_1 0x258 #define MT8183_TOP_AXI_PROT_EN_SET 0x2a0 -- cgit v1.2.3 From 24c8a743336a1fdf42c0c768b4435633069c6a39 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 30 Sep 2020 20:48:02 +0200 Subject: pcmcia: at91_cf: move definitions locally struct at91_cf_data is only used in the driver since all the platforms moved to device tree, move its definition locally. 
Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20200930184804.3127757-1-alexandre.belloni@bootlin.com --- drivers/pcmcia/at91_cf.c | 12 +++++++++++- include/linux/platform_data/atmel.h | 12 ------------ 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/pcmcia/at91_cf.c b/drivers/pcmcia/at91_cf.c index 7db0e9c74dfc..ed60c4dffecb 100644 --- a/drivers/pcmcia/at91_cf.c +++ b/drivers/pcmcia/at91_cf.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,17 @@ #define CF_IO_PHYS (1 << 23) #define CF_MEM_PHYS (0x017ff800) +struct at91_cf_data { + int irq_pin; /* I/O IRQ */ + int det_pin; /* Card detect */ + int vcc_pin; /* power switching */ + int rst_pin; /* card reset */ + u8 chipselect; /* EBI Chip Select number */ + u8 flags; +#define AT91_CF_TRUE_IDE 0x01 +#define AT91_IDE_SWAP_A0_A2 0x02 +}; + struct regmap *mc; /*--------------------------------------------------------------------------*/ diff --git a/include/linux/platform_data/atmel.h b/include/linux/platform_data/atmel.h index 99e6069c5fd8..73f63be509c4 100644 --- a/include/linux/platform_data/atmel.h +++ b/include/linux/platform_data/atmel.h @@ -6,18 +6,6 @@ #ifndef __ATMEL_H__ #define __ATMEL_H__ - /* Compact Flash */ -struct at91_cf_data { - int irq_pin; /* I/O IRQ */ - int det_pin; /* Card detect */ - int vcc_pin; /* power switching */ - int rst_pin; /* card reset */ - u8 chipselect; /* EBI Chip Select number */ - u8 flags; -#define AT91_CF_TRUE_IDE 0x01 -#define AT91_IDE_SWAP_A0_A2 0x02 -}; - /* FIXME: this needs a better location, but gets stuff building again */ #ifdef CONFIG_ATMEL_PM extern int at91_suspend_entering_slow_clock(void); -- cgit v1.2.3 From a69dcdfc2dd21f86cb1f79f98fc94c52f96cff64 Mon Sep 17 00:00:00 2001 From: Chun-Kuang Hu Date: Mon, 2 Nov 2020 08:04:38 +0800 Subject: soc / drm: mediatek: cmdq: Remove timeout handler in helper function For each client driver, its timeout handler needs to dump hardware registers or its state machine information, and their ways of detecting a timeout also differ, so remove the timeout handler from the helper function and let each client driver implement its own timeout handler.
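A client that still wants timeout detection can keep its own timer around the flush; a rough sketch (hypothetical names, and the dump in the handler would be device specific):

	struct my_ctx {
		struct cmdq_client *cl;
		struct timer_list timer;	/* set up via timer_setup() at probe */
	};

	static void my_cmdq_timeout(struct timer_list *t)
	{
		struct my_ctx *ctx = from_timer(ctx, t, timer);

		/* dump hardware registers / state machine here */
		dev_err(ctx->cl->chan->mbox->dev, "cmdq timeout!\n");
	}

	static void my_pkt_done(struct cmdq_cb_data data)
	{
		struct my_ctx *ctx = data.data;

		del_timer(&ctx->timer);		/* completed in time */
	}

	static int my_flush(struct my_ctx *ctx, struct cmdq_pkt *pkt)
	{
		mod_timer(&ctx->timer, jiffies + msecs_to_jiffies(2000));
		return cmdq_pkt_flush_async(pkt, my_pkt_done, ctx);
	}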
Signed-off-by: Chun-Kuang Hu Acked-by: Matthias Brugger Link: https://lore.kernel.org/r/20201102000438.29225-1-chunkuang.hu@kernel.org Signed-off-by: Matthias Brugger --- drivers/gpu/drm/mediatek/mtk_drm_crtc.c | 3 +-- drivers/soc/mediatek/mtk-cmdq-helper.c | 41 +-------------------------------- include/linux/soc/mediatek/mtk-cmdq.h | 10 +------- 3 files changed, 3 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c index ac038572164d..4be5d1fccf2e 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_crtc.c +++ b/drivers/gpu/drm/mediatek/mtk_drm_crtc.c @@ -824,8 +824,7 @@ int mtk_drm_crtc_create(struct drm_device *drm_dev, #if IS_REACHABLE(CONFIG_MTK_CMDQ) mtk_crtc->cmdq_client = cmdq_mbox_create(mtk_crtc->mmsys_dev, - drm_crtc_index(&mtk_crtc->base), - 2000); + drm_crtc_index(&mtk_crtc->base)); if (IS_ERR(mtk_crtc->cmdq_client)) { dev_dbg(dev, "mtk_crtc %d failed to create mailbox client, writing register by CPU now\n", drm_crtc_index(&mtk_crtc->base)); diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c index 505651b0d715..280d3bd9f675 100644 --- a/drivers/soc/mediatek/mtk-cmdq-helper.c +++ b/drivers/soc/mediatek/mtk-cmdq-helper.c @@ -70,14 +70,7 @@ int cmdq_dev_get_client_reg(struct device *dev, } EXPORT_SYMBOL(cmdq_dev_get_client_reg); -static void cmdq_client_timeout(struct timer_list *t) -{ - struct cmdq_client *client = from_timer(client, t, timer); - - dev_err(client->client.dev, "cmdq timeout!\n"); -} - -struct cmdq_client *cmdq_mbox_create(struct device *dev, int index, u32 timeout) +struct cmdq_client *cmdq_mbox_create(struct device *dev, int index) { struct cmdq_client *client; @@ -85,12 +78,6 @@ struct cmdq_client *cmdq_mbox_create(struct device *dev, int index, u32 timeout) if (!client) return (struct cmdq_client *)-ENOMEM; - client->timeout_ms = timeout; - if (timeout != CMDQ_NO_TIMEOUT) { - spin_lock_init(&client->lock); - timer_setup(&client->timer, cmdq_client_timeout, 0); - } - client->pkt_cnt = 0; client->client.dev = dev; client->client.tx_block = false; client->client.knows_txdone = true; @@ -112,11 +99,6 @@ EXPORT_SYMBOL(cmdq_mbox_create); void cmdq_mbox_destroy(struct cmdq_client *client) { - if (client->timeout_ms != CMDQ_NO_TIMEOUT) { - spin_lock(&client->lock); - del_timer_sync(&client->timer); - spin_unlock(&client->lock); - } mbox_free_channel(client->chan); kfree(client); } @@ -449,18 +431,6 @@ static void cmdq_pkt_flush_async_cb(struct cmdq_cb_data data) struct cmdq_task_cb *cb = &pkt->cb; struct cmdq_client *client = (struct cmdq_client *)pkt->cl; - if (client->timeout_ms != CMDQ_NO_TIMEOUT) { - unsigned long flags = 0; - - spin_lock_irqsave(&client->lock, flags); - if (--client->pkt_cnt == 0) - del_timer(&client->timer); - else - mod_timer(&client->timer, jiffies + - msecs_to_jiffies(client->timeout_ms)); - spin_unlock_irqrestore(&client->lock, flags); - } - dma_sync_single_for_cpu(client->chan->mbox->dev, pkt->pa_base, pkt->cmd_buf_size, DMA_TO_DEVICE); if (cb->cb) { @@ -473,7 +443,6 @@ int cmdq_pkt_flush_async(struct cmdq_pkt *pkt, cmdq_async_flush_cb cb, void *data) { int err; - unsigned long flags = 0; struct cmdq_client *client = (struct cmdq_client *)pkt->cl; pkt->cb.cb = cb; @@ -484,14 +453,6 @@ int cmdq_pkt_flush_async(struct cmdq_pkt *pkt, cmdq_async_flush_cb cb, dma_sync_single_for_device(client->chan->mbox->dev, pkt->pa_base, pkt->cmd_buf_size, DMA_TO_DEVICE); - if (client->timeout_ms != CMDQ_NO_TIMEOUT) { - 
spin_lock_irqsave(&client->lock, flags); - if (client->pkt_cnt++ == 0) - mod_timer(&client->timer, jiffies + - msecs_to_jiffies(client->timeout_ms)); - spin_unlock_irqrestore(&client->lock, flags); - } - err = mbox_send_message(client->chan, pkt); if (err < 0) return err; diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h index 960704d75994..8e9996610978 100644 --- a/include/linux/soc/mediatek/mtk-cmdq.h +++ b/include/linux/soc/mediatek/mtk-cmdq.h @@ -11,7 +11,6 @@ #include #include -#define CMDQ_NO_TIMEOUT 0xffffffffu #define CMDQ_ADDR_HIGH(addr) ((u32)(((addr) >> 16) & GENMASK(31, 0))) #define CMDQ_ADDR_LOW(addr) ((u16)(addr) | BIT(1)) @@ -24,12 +23,8 @@ struct cmdq_client_reg { }; struct cmdq_client { - spinlock_t lock; - u32 pkt_cnt; struct mbox_client client; struct mbox_chan *chan; - struct timer_list timer; - u32 timeout_ms; /* in unit of microsecond */ }; /** @@ -51,13 +46,10 @@ int cmdq_dev_get_client_reg(struct device *dev, * cmdq_mbox_create() - create CMDQ mailbox client and channel * @dev: device of CMDQ mailbox client * @index: index of CMDQ mailbox channel - * @timeout: timeout of a pkt execution by GCE, in unit of microsecond, set - * CMDQ_NO_TIMEOUT if a timer is not used. * * Return: CMDQ mailbox client pointer */ -struct cmdq_client *cmdq_mbox_create(struct device *dev, int index, - u32 timeout); +struct cmdq_client *cmdq_mbox_create(struct device *dev, int index); /** * cmdq_mbox_destroy() - destroy CMDQ mailbox client and channel -- cgit v1.2.3 From 51c0e618b219c025ddaaf14baea8942cb7e2105b Mon Sep 17 00:00:00 2001 From: Yongqiang Niu Date: Tue, 6 Oct 2020 21:33:17 +0200 Subject: soc / drm: mediatek: Move DDP component defines into mtk-mmsys.h MMSYS is the driver which controls the routing of these DDP components, so the definition of the mtk_ddp_comp_id enum should be placed in mtk-mmsys.h Signed-off-by: Yongqiang Niu Signed-off-by: Enric Balletbo i Serra Reviewed-by: Chun-Kuang Hu Link: https://lore.kernel.org/r/20201006193320.405529-2-enric.balletbo@collabora.com Signed-off-by: Matthias Brugger --- drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.h | 34 +---------------------------- drivers/soc/mediatek/mtk-mmsys.c | 4 +--- include/linux/soc/mediatek/mtk-mmsys.h | 33 ++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.h b/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.h index 1d9e00b69462..5aa52b7afeec 100644 --- a/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.h +++ b/drivers/gpu/drm/mediatek/mtk_drm_ddp_comp.h @@ -7,6 +7,7 @@ #define MTK_DRM_DDP_COMP_H #include +#include struct device; struct device_node; @@ -35,39 +36,6 @@ enum mtk_ddp_comp_type { MTK_DDP_COMP_TYPE_MAX, }; -enum mtk_ddp_comp_id { - DDP_COMPONENT_AAL0, - DDP_COMPONENT_AAL1, - DDP_COMPONENT_BLS, - DDP_COMPONENT_CCORR, - DDP_COMPONENT_COLOR0, - DDP_COMPONENT_COLOR1, - DDP_COMPONENT_DITHER, - DDP_COMPONENT_DPI0, - DDP_COMPONENT_DPI1, - DDP_COMPONENT_DSI0, - DDP_COMPONENT_DSI1, - DDP_COMPONENT_DSI2, - DDP_COMPONENT_DSI3, - DDP_COMPONENT_GAMMA, - DDP_COMPONENT_OD0, - DDP_COMPONENT_OD1, - DDP_COMPONENT_OVL0, - DDP_COMPONENT_OVL_2L0, - DDP_COMPONENT_OVL_2L1, - DDP_COMPONENT_OVL1, - DDP_COMPONENT_PWM0, - DDP_COMPONENT_PWM1, - DDP_COMPONENT_PWM2, - DDP_COMPONENT_RDMA0, - DDP_COMPONENT_RDMA1, - DDP_COMPONENT_RDMA2, - DDP_COMPONENT_UFOE, - DDP_COMPONENT_WDMA0, - DDP_COMPONENT_WDMA1, - DDP_COMPONENT_ID_MAX, -}; - struct mtk_ddp_comp; struct cmdq_pkt; struct mtk_ddp_comp_funcs { 
diff --git a/drivers/soc/mediatek/mtk-mmsys.c b/drivers/soc/mediatek/mtk-mmsys.c index a55f25511173..36ad66bb221b 100644 --- a/drivers/soc/mediatek/mtk-mmsys.c +++ b/drivers/soc/mediatek/mtk-mmsys.c @@ -5,13 +5,11 @@ */ #include +#include #include #include #include -#include "../../gpu/drm/mediatek/mtk_drm_ddp.h" -#include "../../gpu/drm/mediatek/mtk_drm_ddp_comp.h" - #define DISP_REG_CONFIG_DISP_OVL0_MOUT_EN 0x040 #define DISP_REG_CONFIG_DISP_OVL1_MOUT_EN 0x044 #define DISP_REG_CONFIG_DISP_OD_MOUT_EN 0x048 diff --git a/include/linux/soc/mediatek/mtk-mmsys.h b/include/linux/soc/mediatek/mtk-mmsys.h index 7bab5d9a3d31..2228bf6133da 100644 --- a/include/linux/soc/mediatek/mtk-mmsys.h +++ b/include/linux/soc/mediatek/mtk-mmsys.h @@ -9,6 +9,39 @@ enum mtk_ddp_comp_id; struct device; +enum mtk_ddp_comp_id { + DDP_COMPONENT_AAL0, + DDP_COMPONENT_AAL1, + DDP_COMPONENT_BLS, + DDP_COMPONENT_CCORR, + DDP_COMPONENT_COLOR0, + DDP_COMPONENT_COLOR1, + DDP_COMPONENT_DITHER, + DDP_COMPONENT_DPI0, + DDP_COMPONENT_DPI1, + DDP_COMPONENT_DSI0, + DDP_COMPONENT_DSI1, + DDP_COMPONENT_DSI2, + DDP_COMPONENT_DSI3, + DDP_COMPONENT_GAMMA, + DDP_COMPONENT_OD0, + DDP_COMPONENT_OD1, + DDP_COMPONENT_OVL0, + DDP_COMPONENT_OVL_2L0, + DDP_COMPONENT_OVL_2L1, + DDP_COMPONENT_OVL1, + DDP_COMPONENT_PWM0, + DDP_COMPONENT_PWM1, + DDP_COMPONENT_PWM2, + DDP_COMPONENT_RDMA0, + DDP_COMPONENT_RDMA1, + DDP_COMPONENT_RDMA2, + DDP_COMPONENT_UFOE, + DDP_COMPONENT_WDMA0, + DDP_COMPONENT_WDMA1, + DDP_COMPONENT_ID_MAX, +}; + void mtk_mmsys_ddp_connect(struct device *dev, enum mtk_ddp_comp_id cur, enum mtk_ddp_comp_id next); -- cgit v1.2.3 From 7776bcd241e08e13ef009926c6dea84dc3b2f8ff Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 3 Nov 2020 00:48:44 +0100 Subject: power: supply: s3c-adc-battery: Convert to GPIO descriptors This converts the S3C ADC battery to use GPIO descriptors instead of a global GPIO number for the charging completed GPIO. Using the pattern from the GPIO charger we name this GPIO line "charge-status" in the board file. 
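Condensed to its essence, the conversion looks like this on both sides (names mirror the patch below; error handling trimmed):

	/* board file: route a chip-relative line to the consumer by name */
	static struct gpiod_lookup_table bat_gpios = {
		.dev_id = "s3c-adc-battery",
		.table = {
			GPIO_LOOKUP("GPIOF", 3, "charge-status", GPIO_ACTIVE_LOW),
			{ },
		},
	};

	gpiod_add_lookup_table(&bat_gpios);

	/* driver: look the line up by name; gpiolib now owns the polarity */
	desc = devm_gpiod_get_optional(dev, "charge-status", GPIOD_IN);
	if (IS_ERR(desc))
		return PTR_ERR(desc);
	charged = desc ? gpiod_get_value(desc) : 0;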
Cc: linux-samsung-soc@vger.kernel.org Cc: Sergiy Kibrik Signed-off-by: Linus Walleij Signed-off-by: Sebastian Reichel --- arch/arm/mach-s3c/mach-h1940.c | 12 +++++-- arch/arm/mach-s3c/mach-rx1950.c | 11 ++++++- drivers/power/supply/s3c_adc_battery.c | 57 +++++++++++++++++----------------- include/linux/s3c_adc_battery.h | 3 -- 4 files changed, 49 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-s3c/mach-h1940.c b/arch/arm/mach-s3c/mach-h1940.c index 53d51aa83200..8a43ed1c4c4d 100644 --- a/arch/arm/mach-s3c/mach-h1940.c +++ b/arch/arm/mach-s3c/mach-h1940.c @@ -297,6 +297,15 @@ static const struct s3c_adc_bat_thresh bat_lut_acin[] = { { .volt = 3841, .cur = 0, .level = 0}, }; +static struct gpiod_lookup_table h1940_bat_gpio_table = { + .dev_id = "s3c-adc-battery", + .table = { + /* Charge status S3C2410_GPF(3) */ + GPIO_LOOKUP("GPIOF", 3, "charge-status", GPIO_ACTIVE_LOW), + { }, + }, +}; + static int h1940_bat_init(void) { int ret; @@ -330,8 +339,6 @@ static struct s3c_adc_bat_pdata h1940_bat_cfg = { .exit = h1940_bat_exit, .enable_charger = h1940_enable_charger, .disable_charger = h1940_disable_charger, - .gpio_charge_finished = S3C2410_GPF(3), - .gpio_inverted = 1, .lut_noac = bat_lut_noac, .lut_noac_cnt = ARRAY_SIZE(bat_lut_noac), .lut_acin = bat_lut_acin, @@ -720,6 +727,7 @@ static void __init h1940_init(void) s3c24xx_fb_set_platdata(&h1940_fb_info); gpiod_add_lookup_table(&h1940_mmc_gpio_table); gpiod_add_lookup_table(&h1940_audio_gpio_table); + gpiod_add_lookup_table(&h1940_bat_gpio_table); /* Configure the I2S pins (GPE0...GPE4) in correct mode */ s3c_gpio_cfgall_range(S3C2410_GPE(0), 5, S3C_GPIO_SFN(2), S3C_GPIO_PULL_NONE); diff --git a/arch/arm/mach-s3c/mach-rx1950.c b/arch/arm/mach-s3c/mach-rx1950.c index b9758f0a9a14..6e19add158a9 100644 --- a/arch/arm/mach-s3c/mach-rx1950.c +++ b/arch/arm/mach-s3c/mach-rx1950.c @@ -206,6 +206,15 @@ static const struct s3c_adc_bat_thresh bat_lut_acin[] = { { .volt = 3820, .cur = 0, .level = 0}, }; +static struct gpiod_lookup_table rx1950_bat_gpio_table = { + .dev_id = "s3c-adc-battery", + .table = { + /* Charge status S3C2410_GPF(3) */ + GPIO_LOOKUP("GPIOF", 3, "charge-status", GPIO_ACTIVE_HIGH), + { }, + }, +}; + static int rx1950_bat_init(void) { int ret; @@ -331,7 +340,6 @@ static struct s3c_adc_bat_pdata rx1950_bat_cfg = { .exit = rx1950_bat_exit, .enable_charger = rx1950_enable_charger, .disable_charger = rx1950_disable_charger, - .gpio_charge_finished = S3C2410_GPF(3), .lut_noac = bat_lut_noac, .lut_noac_cnt = ARRAY_SIZE(bat_lut_noac), .lut_acin = bat_lut_acin, @@ -840,6 +848,7 @@ static void __init rx1950_init_machine(void) pwm_add_table(rx1950_pwm_lookup, ARRAY_SIZE(rx1950_pwm_lookup)); gpiod_add_lookup_table(&rx1950_audio_gpio_table); + gpiod_add_lookup_table(&rx1950_bat_gpio_table); /* Configure the I2S pins (GPE0...GPE4) in correct mode */ s3c_gpio_cfgall_range(S3C2410_GPE(0), 5, S3C_GPIO_SFN(2), S3C_GPIO_PULL_NONE); diff --git a/drivers/power/supply/s3c_adc_battery.c b/drivers/power/supply/s3c_adc_battery.c index 60b7f41ab063..a2addc24ee8b 100644 --- a/drivers/power/supply/s3c_adc_battery.c +++ b/drivers/power/supply/s3c_adc_battery.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -31,6 +31,7 @@ struct s3c_adc_bat { struct power_supply *psy; struct s3c_adc_client *client; struct s3c_adc_bat_pdata *pdata; + struct gpio_desc *charge_finished; int volt_value; int cur_value; unsigned int timestamp; @@ -132,9 +133,7 @@ static int calc_full_volt(int 
volt_val, int cur_val, int impedance) static int charge_finished(struct s3c_adc_bat *bat) { - return bat->pdata->gpio_inverted ? - !gpio_get_value(bat->pdata->gpio_charge_finished) : - gpio_get_value(bat->pdata->gpio_charge_finished); + return gpiod_get_value(bat->charge_finished); } static int s3c_adc_bat_get_property(struct power_supply *psy, @@ -169,7 +168,7 @@ static int s3c_adc_bat_get_property(struct power_supply *psy, } if (bat->cable_plugged && - ((bat->pdata->gpio_charge_finished < 0) || + (!bat->charge_finished || !charge_finished(bat))) { lut = bat->pdata->lut_acin; lut_size = bat->pdata->lut_acin_cnt; @@ -206,7 +205,7 @@ static int s3c_adc_bat_get_property(struct power_supply *psy, switch (psp) { case POWER_SUPPLY_PROP_STATUS: - if (bat->pdata->gpio_charge_finished < 0) + if (!bat->charge_finished) val->intval = bat->level == 100000 ? POWER_SUPPLY_STATUS_FULL : bat->status; else @@ -265,7 +264,7 @@ static void s3c_adc_bat_work(struct work_struct *work) bat->status = POWER_SUPPLY_STATUS_DISCHARGING; } } else { - if ((bat->pdata->gpio_charge_finished >= 0) && is_plugged) { + if (bat->charge_finished && is_plugged) { is_charged = charge_finished(&main_bat); if (is_charged) { if (bat->pdata->disable_charger) @@ -294,6 +293,7 @@ static int s3c_adc_bat_probe(struct platform_device *pdev) struct s3c_adc_client *client; struct s3c_adc_bat_pdata *pdata = pdev->dev.platform_data; struct power_supply_config psy_cfg = {}; + struct gpio_desc *gpiod; int ret; client = s3c_adc_register(pdev, NULL, NULL, 0); @@ -304,8 +304,17 @@ static int s3c_adc_bat_probe(struct platform_device *pdev) platform_set_drvdata(pdev, client); + gpiod = devm_gpiod_get_optional(&pdev->dev, "charge-status", GPIOD_IN); + if (IS_ERR(gpiod)) { + /* Could be probe deferral etc */ + ret = PTR_ERR(gpiod); + dev_err(&pdev->dev, "no GPIO %d\n", ret); + return ret; + } + main_bat.client = client; main_bat.pdata = pdata; + main_bat.charge_finished = gpiod; main_bat.volt_value = -1; main_bat.cur_value = -1; main_bat.cable_plugged = 0; @@ -323,6 +332,7 @@ static int s3c_adc_bat_probe(struct platform_device *pdev) backup_bat.client = client; backup_bat.pdata = pdev->dev.platform_data; + backup_bat.charge_finished = gpiod; backup_bat.volt_value = -1; backup_bat.psy = power_supply_register(&pdev->dev, &backup_bat_desc, @@ -335,12 +345,8 @@ static int s3c_adc_bat_probe(struct platform_device *pdev) INIT_DELAYED_WORK(&bat_work, s3c_adc_bat_work); - if (pdata->gpio_charge_finished >= 0) { - ret = gpio_request(pdata->gpio_charge_finished, "charged"); - if (ret) - goto err_gpio; - - ret = request_irq(gpio_to_irq(pdata->gpio_charge_finished), + if (gpiod) { + ret = request_irq(gpiod_to_irq(gpiod), s3c_adc_bat_charged, IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, "battery charged", NULL); @@ -364,12 +370,9 @@ static int s3c_adc_bat_probe(struct platform_device *pdev) return 0; err_platform: - if (pdata->gpio_charge_finished >= 0) - free_irq(gpio_to_irq(pdata->gpio_charge_finished), NULL); + if (gpiod) + free_irq(gpiod_to_irq(gpiod), NULL); err_irq: - if (pdata->gpio_charge_finished >= 0) - gpio_free(pdata->gpio_charge_finished); -err_gpio: if (pdata->backup_volt_mult) power_supply_unregister(backup_bat.psy); err_reg_backup: @@ -389,10 +392,8 @@ static int s3c_adc_bat_remove(struct platform_device *pdev) s3c_adc_release(client); - if (pdata->gpio_charge_finished >= 0) { - free_irq(gpio_to_irq(pdata->gpio_charge_finished), NULL); - gpio_free(pdata->gpio_charge_finished); - } + if (main_bat.charge_finished) + 
free_irq(gpiod_to_irq(main_bat.charge_finished), NULL); cancel_delayed_work(&bat_work); @@ -408,12 +409,12 @@ static int s3c_adc_bat_suspend(struct platform_device *pdev, { struct s3c_adc_bat_pdata *pdata = pdev->dev.platform_data; - if (pdata->gpio_charge_finished >= 0) { + if (main_bat.charge_finished) { if (device_may_wakeup(&pdev->dev)) enable_irq_wake( - gpio_to_irq(pdata->gpio_charge_finished)); + gpiod_to_irq(main_bat.charge_finished)); else { - disable_irq(gpio_to_irq(pdata->gpio_charge_finished)); + disable_irq(gpiod_to_irq(main_bat.charge_finished)); main_bat.pdata->disable_charger(); } } @@ -425,12 +426,12 @@ static int s3c_adc_bat_resume(struct platform_device *pdev) { struct s3c_adc_bat_pdata *pdata = pdev->dev.platform_data; - if (pdata->gpio_charge_finished >= 0) { + if (main_bat.charge_finished) { if (device_may_wakeup(&pdev->dev)) disable_irq_wake( - gpio_to_irq(pdata->gpio_charge_finished)); + gpiod_to_irq(main_bat.charge_finished)); else - enable_irq(gpio_to_irq(pdata->gpio_charge_finished)); + enable_irq(gpiod_to_irq(main_bat.charge_finished)); } /* Schedule timer to check current status */ diff --git a/include/linux/s3c_adc_battery.h b/include/linux/s3c_adc_battery.h index 833871dcf6fd..57f982c375f8 100644 --- a/include/linux/s3c_adc_battery.h +++ b/include/linux/s3c_adc_battery.h @@ -14,9 +14,6 @@ struct s3c_adc_bat_pdata { void (*enable_charger)(void); void (*disable_charger)(void); - int gpio_charge_finished; - int gpio_inverted; - const struct s3c_adc_bat_thresh *lut_noac; unsigned int lut_noac_cnt; const struct s3c_adc_bat_thresh *lut_acin; -- cgit v1.2.3 From b0327ffb133fb2148fc3bc2afb39af2871ab21cb Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 30 Oct 2020 13:24:24 +0100 Subject: power: supply: generic-adc-battery: Use GPIO descriptors This driver uses platform data to pass GPIO lines using the deprecated global GPIO numbers. There are no in-tree users of this platform data. Any out-of-tree or future users of this driver can easily be migrated to use machine descriptor tables as described in Documentation/driver-api/gpio/board.rst section "platform data". Cc: Anish Kumar Cc: H.
Nikolaus Schaller Signed-off-by: Linus Walleij Signed-off-by: Sebastian Reichel --- drivers/power/supply/generic-adc-battery.c | 31 +++++++++++------------------- include/linux/power/generic-adc-battery.h | 4 ---- 2 files changed, 11 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/supply/generic-adc-battery.c b/drivers/power/supply/generic-adc-battery.c index caa829738ef7..0032069fbc2b 100644 --- a/drivers/power/supply/generic-adc-battery.c +++ b/drivers/power/supply/generic-adc-battery.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -52,6 +52,7 @@ struct gab { int level; int status; bool cable_plugged; + struct gpio_desc *charge_finished; }; static struct gab *to_generic_bat(struct power_supply *psy) @@ -91,13 +92,9 @@ static const enum power_supply_property gab_dyn_props[] = { static bool gab_charge_finished(struct gab *adc_bat) { - struct gab_platform_data *pdata = adc_bat->pdata; - bool ret = gpio_get_value(pdata->gpio_charge_finished); - bool inv = pdata->gpio_inverted; - - if (!gpio_is_valid(pdata->gpio_charge_finished)) + if (!adc_bat->charge_finished) return false; - return ret ^ inv; + return gpiod_get_value(adc_bat->charge_finished); } static int gab_get_status(struct gab *adc_bat) @@ -327,18 +324,17 @@ static int gab_probe(struct platform_device *pdev) INIT_DELAYED_WORK(&adc_bat->bat_work, gab_work); - if (gpio_is_valid(pdata->gpio_charge_finished)) { + adc_bat->charge_finished = devm_gpiod_get_optional(&pdev->dev, + "charged", GPIOD_IN); + if (adc_bat->charge_finished) { int irq; - ret = gpio_request(pdata->gpio_charge_finished, "charged"); - if (ret) - goto gpio_req_fail; - irq = gpio_to_irq(pdata->gpio_charge_finished); + irq = gpiod_to_irq(adc_bat->charge_finished); ret = request_any_context_irq(irq, gab_charged, IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, "battery charged", adc_bat); if (ret < 0) - goto err_gpio; + goto gpio_req_fail; } platform_set_drvdata(pdev, adc_bat); @@ -348,8 +344,6 @@ static int gab_probe(struct platform_device *pdev) msecs_to_jiffies(0)); return 0; -err_gpio: - gpio_free(pdata->gpio_charge_finished); gpio_req_fail: power_supply_unregister(adc_bat->psy); err_reg_fail: @@ -367,14 +361,11 @@ static int gab_remove(struct platform_device *pdev) { int chan; struct gab *adc_bat = platform_get_drvdata(pdev); - struct gab_platform_data *pdata = adc_bat->pdata; power_supply_unregister(adc_bat->psy); - if (gpio_is_valid(pdata->gpio_charge_finished)) { - free_irq(gpio_to_irq(pdata->gpio_charge_finished), adc_bat); - gpio_free(pdata->gpio_charge_finished); - } + if (adc_bat->charge_finished) + free_irq(gpiod_to_irq(adc_bat->charge_finished), adc_bat); for (chan = 0; chan < ARRAY_SIZE(gab_chan_name); chan++) { if (adc_bat->channel[chan]) diff --git a/include/linux/power/generic-adc-battery.h b/include/linux/power/generic-adc-battery.h index 40f9c7628f7b..c68cbf34cd34 100644 --- a/include/linux/power/generic-adc-battery.h +++ b/include/linux/power/generic-adc-battery.h @@ -11,16 +11,12 @@ * @battery_info: recommended structure to specify static power supply * parameters * @cal_charge: calculate charge level. - * @gpio_charge_finished: gpio for the charger. - * @gpio_inverted: Should be 1 if the GPIO is active low otherwise 0 * @jitter_delay: delay required after the interrupt to check battery * status.Default set is 10ms. 
*/ struct gab_platform_data { struct power_supply_info battery_info; int (*cal_charge)(long value); - int gpio_charge_finished; - bool gpio_inverted; int jitter_delay; }; -- cgit v1.2.3 From 57e3cebd022fbc035dcf190ac789fd2ffc747f5b Mon Sep 17 00:00:00 2001 From: Shenming Lu Date: Sat, 28 Nov 2020 22:18:57 +0800 Subject: KVM: arm64: Delay the polling of the GICR_VPENDBASER.Dirty bit In order to reduce the impact of the VPT parsing happening on the GIC, we can split the vcpu residency into two phases: - programming GICR_VPENDBASER: this still happens in vcpu_load() - checking for the VPT parsing to be complete: this can happen on vcpu entry (in kvm_vgic_flush_hwstate()) This allows the GIC and the CPU to work in parallel, removing some of the entry overhead. Suggested-by: Marc Zyngier Signed-off-by: Shenming Lu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201128141857.983-3-lushenming@huawei.com --- arch/arm64/kvm/vgic/vgic-v4.c | 12 ++++++++++++ arch/arm64/kvm/vgic/vgic.c | 3 +++ drivers/irqchip/irq-gic-v3-its.c | 12 ++++++++---- drivers/irqchip/irq-gic-v4.c | 19 +++++++++++++++++++ include/kvm/arm_vgic.h | 1 + include/linux/irqchip/arm-gic-v4.h | 4 ++++ 6 files changed, 47 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index b5fa73c9fd35..66508b03094f 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -353,6 +353,18 @@ int vgic_v4_load(struct kvm_vcpu *vcpu) return err; } +void vgic_v4_commit(struct kvm_vcpu *vcpu) +{ + struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + /* + * No need to wait for the vPE to be ready across a shallow guest + * exit, as only a vcpu_put will invalidate it. + */ + if (!vpe->ready) + its_commit_vpe(vpe); +} + static struct vgic_its *vgic_get_its(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *irq_entry) { diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index c3643b7f101b..1c597c9885fa 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -915,6 +915,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) if (can_access_vgic_from_kernel()) vgic_restore_state(vcpu); + + if (vgic_supports_direct_msis(vcpu->kvm)) + vgic_v4_commit(vcpu); } void kvm_vgic_load(struct kvm_vcpu *vcpu) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0fec31931e11..7db602434ac5 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3842,8 +3842,6 @@ static void its_vpe_schedule(struct its_vpe *vpe) val |= vpe->idai ?
GICR_VPENDBASER_IDAI : 0; val |= GICR_VPENDBASER_Valid; gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); - - its_wait_vpt_parse_complete(); } static void its_vpe_deschedule(struct its_vpe *vpe) @@ -3891,6 +3889,10 @@ static int its_vpe_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) its_vpe_deschedule(vpe); return 0; + case COMMIT_VPE: + its_wait_vpt_parse_complete(); + return 0; + case INVALL_VPE: its_vpe_invall(vpe); return 0; @@ -4052,8 +4054,6 @@ static void its_vpe_4_1_schedule(struct its_vpe *vpe, val |= FIELD_PREP(GICR_VPENDBASER_4_1_VPEID, vpe->vpe_id); gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER); - - its_wait_vpt_parse_complete(); } static void its_vpe_4_1_deschedule(struct its_vpe *vpe, @@ -4128,6 +4128,10 @@ static int its_vpe_4_1_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) its_vpe_4_1_deschedule(vpe, info); return 0; + case COMMIT_VPE: + its_wait_vpt_parse_complete(); + return 0; + case INVALL_VPE: its_vpe_4_1_invall(vpe); return 0; diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 0c18714ae13e..5d1dc9915272 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -232,6 +232,8 @@ int its_make_vpe_non_resident(struct its_vpe *vpe, bool db) if (!ret) vpe->resident = false; + vpe->ready = false; + return ret; } @@ -258,6 +260,23 @@ int its_make_vpe_resident(struct its_vpe *vpe, bool g0en, bool g1en) return ret; } +int its_commit_vpe(struct its_vpe *vpe) +{ + struct its_cmd_info info = { + .cmd_type = COMMIT_VPE, + }; + int ret; + + WARN_ON(preemptible()); + + ret = its_send_vpe_cmd(vpe, &info); + if (!ret) + vpe->ready = true; + + return ret; +} + + int its_invall_vpe(struct its_vpe *vpe) { struct its_cmd_info info = { diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index a8d8fdcd3723..3d74f1060bd1 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -402,6 +402,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int irq, struct kvm_kernel_irq_routing_entry *irq_entry); int vgic_v4_load(struct kvm_vcpu *vcpu); +void vgic_v4_commit(struct kvm_vcpu *vcpu); int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db); #endif /* __KVM_ARM_VGIC_H */ diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 6976b8331b60..943c3411ca10 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -39,6 +39,8 @@ struct its_vpe { irq_hw_number_t vpe_db_lpi; /* VPE resident */ bool resident; + /* VPT parse complete */ + bool ready; union { /* GICv4.0 implementations */ struct { @@ -104,6 +106,7 @@ enum its_vcpu_info_cmd_type { PROP_UPDATE_AND_INV_VLPI, SCHEDULE_VPE, DESCHEDULE_VPE, + COMMIT_VPE, INVALL_VPE, PROP_UPDATE_VSGI, }; @@ -129,6 +132,7 @@ int its_alloc_vcpu_irqs(struct its_vm *vm); void its_free_vcpu_irqs(struct its_vm *vm); int its_make_vpe_resident(struct its_vpe *vpe, bool g0en, bool g1en); int its_make_vpe_non_resident(struct its_vpe *vpe, bool db); +int its_commit_vpe(struct its_vpe *vpe); int its_invall_vpe(struct its_vpe *vpe); int its_map_vlpi(int irq, struct its_vlpi_map *map); int its_get_vlpi(int irq, struct its_vlpi_map *map); -- cgit v1.2.3 From 147ad605dc12c515c97136899ccb5c70e6c674e1 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:16 +0100 Subject: init: use type alignment for kernel parameters Specify type alignment for kernel parameters instead of sizeof(long). 
The alignment attribute is used to prevent gcc from increasing the alignment of objects with static extent as an optimisation, something which would mess up the __setup array stride. Using __alignof__(struct obs_kernel_param) rather than sizeof(long) is preferred since it better indicates why it is there and doesn't break should the type size or alignment change. Note that on m68k the alignment of struct obs_kernel_param is actually two and that adding a 1- or 2-byte field to the 12-byte struct would cause a breakage with the current 4-byte alignment. Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/init.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init.h b/include/linux/init.h index 7b53cb3092ee..e668832ef66a 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -255,7 +255,7 @@ struct obs_kernel_param { __aligned(1) = str; \ static struct obs_kernel_param __setup_##unique_id \ __used __section(".init.setup") \ - __attribute__((aligned((sizeof(long))))) \ + __aligned(__alignof__(struct obs_kernel_param)) \ = { __setup_str_##unique_id, fn, early } #define __setup(str, fn) \ -- cgit v1.2.3 From 1967939462641d8b36bcb3fcf06d48e66cd67a4f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 29 Nov 2020 04:33:35 +0900 Subject: Compiler Attributes: remove CONFIG_ENABLE_MUST_CHECK Revert commit cebc04ba9aeb ("add CONFIG_ENABLE_MUST_CHECK"). A lot of warn_unused_result warnings existed in 2006, but until now they have been fixed thanks to people doing allmodconfig tests. Our goal is to always enable __must_check where appropriate, so this CONFIG option is no longer needed. I see a lot of defconfig (arch/*/configs/*_defconfig) files having: # CONFIG_ENABLE_MUST_CHECK is not set I did not touch them for now since it would be a big churn. If arch maintainers want to clean them up, please go ahead. While I was here, I also moved __must_check to compiler_attributes.h from compiler_types.h Signed-off-by: Masahiro Yamada Acked-by: Jason A. 
Donenfeld Acked-by: Nathan Chancellor Reviewed-by: Nick Desaulniers [Moved addition in compiler_attributes.h to keep it sorted] Signed-off-by: Miguel Ojeda --- include/linux/compiler_attributes.h | 6 ++++++ include/linux/compiler_types.h | 6 ------ lib/Kconfig.debug | 8 -------- tools/testing/selftests/wireguard/qemu/debug.config | 1 - 4 files changed, 6 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index b2a3f4f641a7..ea5e04e75845 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -272,6 +272,12 @@ */ #define __used __attribute__((__used__)) +/* + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-warn_005funused_005fresult-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#nodiscard-warn-unused-result + */ +#define __must_check __attribute__((__warn_unused_result__)) + /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-weak-function-attribute * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-weak-variable-attribute diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index ac3fa37a84f9..7ef20d1a6c28 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -110,12 +110,6 @@ struct ftrace_likely_data { unsigned long constant; }; -#ifdef CONFIG_ENABLE_MUST_CHECK -#define __must_check __attribute__((__warn_unused_result__)) -#else -#define __must_check -#endif - #if defined(CC_USING_HOTPATCH) #define notrace __attribute__((hotpatch(0, 0))) #elif defined(CC_USING_PATCHABLE_FUNCTION_ENTRY) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c789b39ed527..cb8ef4fd0d02 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -286,14 +286,6 @@ config GDB_SCRIPTS endif # DEBUG_INFO -config ENABLE_MUST_CHECK - bool "Enable __must_check logic" - default y - help - Enable the __must_check logic in the kernel build. Disable this to - suppress the "warning: ignoring return value of 'foo', declared with - attribute warn_unused_result" messages. - config FRAME_WARN int "Warn for stack frames larger than" range 0 8192 diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index b50c2085c1ac..fe07d97df9fa 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -1,5 +1,4 @@ CONFIG_LOCALVERSION="-debug" -CONFIG_ENABLE_MUST_CHECK=y CONFIG_FRAME_POINTER=y CONFIG_STACK_VALIDATION=y CONFIG_DEBUG_KERNEL=y -- cgit v1.2.3 From 93b8959a0a8cf1b1a493efee9e8328681e111862 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Nov 2020 15:24:41 -0500 Subject: NFS: More readdir cleanups Remove the redundant caching of the credential in struct nfs_open_dir_context. Pass the buffer size as an argument to nfs_readdir_xdr_filler(). 
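Condensed, the cleanup amounts to the following before/after (an illustrative fragment distilled from the diff below, not standalone code):

/* Before: each opendir pinned its own credential reference, dropped
 * again at release time. */
ctx->cred = get_cred(current_cred());
error = NFS_PROTO(inode)->readdir(file_dentry(file), ctx->cred,
				  entry->cookie, pages,
				  NFS_SERVER(inode)->dtsize, desc->plus);

/* After: reuse the credential the VFS already pinned in the struct
 * file, and pass the buffer size in from the caller. */
error = NFS_PROTO(inode)->readdir(file_dentry(file), file->f_cred,
				  cookie, pages, bufsize, desc->plus);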
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 25 +++++++++++-------------- include/linux/nfs_fs.h | 1 - 2 files changed, 11 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 438906dae083..bc366bd8e8f3 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -68,7 +68,7 @@ const struct address_space_operations nfs_dir_aops = { .freepage = nfs_readdir_clear_array, }; -static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, const struct cred *cred) +static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir) { struct nfs_inode *nfsi = NFS_I(dir); struct nfs_open_dir_context *ctx; @@ -78,7 +78,6 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir ctx->attr_gencount = nfsi->attr_gencount; ctx->dir_cookie = 0; ctx->dup_cookie = 0; - ctx->cred = get_cred(cred); spin_lock(&dir->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) @@ -96,7 +95,6 @@ static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_cont spin_lock(&dir->i_lock); list_del(&ctx->list); spin_unlock(&dir->i_lock); - put_cred(ctx->cred); kfree(ctx); } @@ -113,7 +111,7 @@ nfs_opendir(struct inode *inode, struct file *filp) nfs_inc_stats(inode, NFSIOS_VFSOPEN); - ctx = alloc_nfs_open_dir_context(inode, current_cred()); + ctx = alloc_nfs_open_dir_context(inode); if (IS_ERR(ctx)) { res = PTR_ERR(ctx); goto out; @@ -468,12 +466,12 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) } /* Fill a page with xdr information before transferring to the cache page */ -static -int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, - struct nfs_entry *entry, struct file *file, struct inode *inode) +static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, + u64 cookie, struct page **pages, + size_t bufsize) { - struct nfs_open_dir_context *ctx = file->private_data; - const struct cred *cred = ctx->cred; + struct file *file = desc->file; + struct inode *inode = file_inode(file); unsigned long timestamp, gencount; int error; @@ -481,8 +479,8 @@ int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); - error = NFS_PROTO(inode)->readdir(file_dentry(file), cred, entry->cookie, pages, - NFS_SERVER(inode)->dtsize, desc->plus); + error = NFS_PROTO(inode)->readdir(file_dentry(file), file->f_cred, + cookie, pages, bufsize, desc->plus); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { @@ -764,7 +762,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, { struct page **pages; struct nfs_entry entry; - struct file *file = desc->file; size_t array_size; size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; @@ -791,8 +788,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, do { unsigned int pglen; - status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); - + status = nfs_readdir_xdr_filler(desc, entry.cookie, + pages, dtsize); if (status < 0) break; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a2c6455ea3fa..dd6b463dda80 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -88,7 +88,6 @@ struct 
nfs_open_context { struct nfs_open_dir_context { struct list_head list; - const struct cred *cred; unsigned long attr_gencount; __u64 dir_cookie; __u64 dup_cookie; -- cgit v1.2.3 From 82e22a5e6245873779db1607d3b0fec6f9ca07d0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 17:34:23 -0500 Subject: NFS: Allow the NFS generic code to pass in a verifier to readdir If we're ever going to allow support for servers that use the readdir verifier, then that use needs to be managed by the middle layers as those need to be able to reject cookies from other verifiers. Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 23 ++++++++++++++++++----- fs/nfs/nfs3proc.c | 35 +++++++++++++++++------------------ fs/nfs/nfs4proc.c | 36 +++++++++++++++++------------------- fs/nfs/proc.c | 18 +++++++++--------- include/linux/nfs_xdr.h | 17 +++++++++++++++-- 5 files changed, 76 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b226f6f3ae96..3ee0668a9719 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -469,8 +469,20 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, u64 cookie, struct page **pages, size_t bufsize) { - struct file *file = desc->file; - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(desc->file); + __be32 verf_res[2]; + struct nfs_readdir_arg arg = { + .dentry = file_dentry(desc->file), + .cred = desc->file->f_cred, + .verf = NFS_I(inode)->cookieverf, + .cookie = cookie, + .pages = pages, + .page_len = bufsize, + .plus = desc->plus, + }; + struct nfs_readdir_res res = { + .verf = verf_res, + }; unsigned long timestamp, gencount; int error; @@ -478,20 +490,21 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, timestamp = jiffies; gencount = nfs_inc_attr_generation_counter(); desc->dir_verifier = nfs_save_change_attribute(inode); - error = NFS_PROTO(inode)->readdir(file_dentry(file), file->f_cred, - cookie, pages, bufsize, desc->plus); + error = NFS_PROTO(inode)->readdir(&arg, &res); if (error < 0) { /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); - desc->plus = false; + desc->plus = arg.plus = false; goto again; } goto error; } desc->timestamp = timestamp; desc->gencount = gencount; + memcpy(NFS_I(inode)->cookieverf, res.verf, + sizeof(NFS_I(inode)->cookieverf)); error: return error; } diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 6b66b73a50eb..5c4e23abc345 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -662,37 +662,36 @@ out: * Also note that this implementation handles both plain readdir and * readdirplus. 
*/ -static int -nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs3_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); - __be32 *verf = NFS_I(dir)->cookieverf; + struct inode *dir = d_inode(nr_arg->dentry); struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .verf = {verf[0], verf[1]}, - .plus = plus, - .count = count, - .pages = pages + .cookie = nr_arg->cookie, + .plus = nr_arg->plus, + .count = nr_arg->page_len, + .pages = nr_arg->pages }; struct nfs3_readdirres res = { - .verf = verf, - .plus = plus + .verf = nr_res->verf, + .plus = nr_arg->plus, }; struct rpc_message msg = { .rpc_proc = &nfs3_procedures[NFS3PROC_READDIR], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status = -ENOMEM; - if (plus) + if (nr_arg->plus) msg.rpc_proc = &nfs3_procedures[NFS3PROC_READDIRPLUS]; + if (arg.cookie) + memcpy(arg.verf, nr_arg->verf, sizeof(arg.verf)); - dprintk("NFS call readdir%s %d\n", - plus? "plus" : "", (unsigned int) cookie); + dprintk("NFS call readdir%s %llu\n", nr_arg->plus ? "plus" : "", + (unsigned long long)nr_arg->cookie); res.dir_attr = nfs_alloc_fattr(); if (res.dir_attr == NULL) @@ -705,8 +704,8 @@ nfs3_proc_readdir(struct dentry *dentry, const struct cred *cred, nfs_free_fattr(res.dir_attr); out: - dprintk("NFS reply readdir%s: %d\n", - plus? "plus" : "", status); + dprintk("NFS reply readdir%s: %d\n", nr_arg->plus ? "plus" : "", + status); return status; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 66f1f4a5c74c..adcaba68eaed 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4961,41 +4961,40 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, return err; } -static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_server *server = NFS_SERVER(dir); struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), - .pages = pages, + .pages = nr_arg->pages, .pgbase = 0, - .count = count, - .plus = plus, + .count = nr_arg->page_len, + .plus = nr_arg->plus, }; struct nfs4_readdir_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READDIR], .rpc_argp = &args, .rpc_resp = &res, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("%s: dentry = %pd2, cookie = %Lu\n", __func__, - dentry, - (unsigned long long)cookie); + dprintk("%s: dentry = %pd2, cookie = %llu\n", __func__, + nr_arg->dentry, (unsigned long long)nr_arg->cookie); if (!(server->caps & NFS_CAP_SECURITY_LABEL)) args.bitmask = server->attr_bitmask_nl; else args.bitmask = server->attr_bitmask; - nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); + nfs4_setup_readdir(nr_arg->cookie, nr_arg->verf, nr_arg->dentry, &args); res.pgbase = args.pgbase; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status >= 0) { - memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); + memcpy(nr_res->verf, res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; } @@ -5005,19 +5004,18 @@ static int _nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, return status; } -static 
int nfs4_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs4_proc_readdir(struct nfs_readdir_arg *arg, + struct nfs_readdir_res *res) { struct nfs4_exception exception = { .interruptible = true, }; int err; do { - err = _nfs4_proc_readdir(dentry, cred, cookie, - pages, count, plus); - trace_nfs4_readdir(d_inode(dentry), err); - err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err, - &exception); + err = _nfs4_proc_readdir(arg, res); + trace_nfs4_readdir(d_inode(arg->dentry), err); + err = nfs4_handle_exception(NFS_SERVER(d_inode(arg->dentry)), + err, &exception); } while (exception.retry); return err; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 15c865cc837f..73ab7c59d3a7 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -499,26 +499,26 @@ nfs_proc_rmdir(struct inode *dir, const struct qstr *name) * sure it is syntactically correct; the entries itself are decoded * from nfs_readdir by calling the decode_entry function directly. */ -static int -nfs_proc_readdir(struct dentry *dentry, const struct cred *cred, - u64 cookie, struct page **pages, unsigned int count, bool plus) +static int nfs_proc_readdir(struct nfs_readdir_arg *nr_arg, + struct nfs_readdir_res *nr_res) { - struct inode *dir = d_inode(dentry); + struct inode *dir = d_inode(nr_arg->dentry); struct nfs_readdirargs arg = { .fh = NFS_FH(dir), - .cookie = cookie, - .count = count, - .pages = pages, + .cookie = nr_arg->cookie, + .count = nr_arg->page_len, + .pages = nr_arg->pages, }; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_READDIR], .rpc_argp = &arg, - .rpc_cred = cred, + .rpc_cred = nr_arg->cred, }; int status; - dprintk("NFS call readdir %d\n", (unsigned int)cookie); + dprintk("NFS call readdir %llu\n", (unsigned long long)nr_arg->cookie); status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nr_res->verf[0] = nr_res->verf[1] = 0; nfs_invalidate_atime(dir); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d63cb862d58e..3327239fa2f9 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -750,6 +750,20 @@ struct nfs_entry { struct nfs_server * server; }; +struct nfs_readdir_arg { + struct dentry *dentry; + const struct cred *cred; + __be32 *verf; + u64 cookie; + struct page **pages; + unsigned int page_len; + bool plus; +}; + +struct nfs_readdir_res { + __be32 *verf; +}; + /* * The following types are for NFSv2 only. */ @@ -1744,8 +1758,7 @@ struct nfs_rpc_ops { unsigned int, struct iattr *); int (*mkdir) (struct inode *, struct dentry *, struct iattr *); int (*rmdir) (struct inode *, const struct qstr *); - int (*readdir) (struct dentry *, const struct cred *, - u64, struct page **, unsigned int, bool); + int (*readdir) (struct nfs_readdir_arg *, struct nfs_readdir_res *); int (*mknod) (struct inode *, struct dentry *, struct iattr *, dev_t); int (*statfs) (struct nfs_server *, struct nfs_fh *, -- cgit v1.2.3 From b593c09f83a2732a0f0298c8f3468236a83cdd9f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 2 Nov 2020 20:06:12 -0500 Subject: NFS: Improve handling of directory verifiers If the server insists on using the readdir verifiers in order to allow cookies to expire, then we should ensure that we cache the verifier with the cookie, so that we can return an error if the application tries to use the expired cookie. 
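The invariant being added can be pictured with a small hypothetical helper (NFS_DIR_VERIFIER_SIZE comes from this patch; the helper itself is invented to illustrate the rule the patch enforces by threading the verifier through the descriptor and the page cache):

#include <linux/string.h>
#include <linux/types.h>

/* A cached cookie is only honoured together with the verifier it was
 * issued under; on a mismatch the reader must not resume from it. */
static bool dir_cookie_verf_matches(const __be32 *cached, const __be32 *cur)
{
	return !memcmp(cached, cur, NFS_DIR_VERIFIER_SIZE * sizeof(__be32));
}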
Signed-off-by: Trond Myklebust Reviewed-by: Benjamin Coddington Tested-by: Benjamin Coddington Tested-by: Dave Wysochanski --- fs/nfs/dir.c | 35 +++++++++++++++++++++++------------ fs/nfs/inode.c | 7 ------- include/linux/nfs_fs.h | 8 +++++++- 3 files changed, 30 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3b44bef3a1b4..454377228167 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -155,6 +155,7 @@ struct nfs_readdir_descriptor { loff_t current_index; loff_t prev_index; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; unsigned long dir_verifier; unsigned long timestamp; unsigned long gencount; @@ -466,15 +467,15 @@ static int nfs_readdir_search_array(struct nfs_readdir_descriptor *desc) /* Fill a page with xdr information before transferring to the cache page */ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, - u64 cookie, struct page **pages, - size_t bufsize) + __be32 *verf, u64 cookie, + struct page **pages, size_t bufsize, + __be32 *verf_res) { struct inode *inode = file_inode(desc->file); - __be32 verf_res[2]; struct nfs_readdir_arg arg = { .dentry = file_dentry(desc->file), .cred = desc->file->f_cred, - .verf = NFS_I(inode)->cookieverf, + .verf = verf, .cookie = cookie, .pages = pages, .page_len = bufsize, @@ -503,8 +504,6 @@ static int nfs_readdir_xdr_filler(struct nfs_readdir_descriptor *desc, } desc->timestamp = timestamp; desc->gencount = gencount; - memcpy(NFS_I(inode)->cookieverf, res.verf, - sizeof(NFS_I(inode)->cookieverf)); error: return error; } @@ -770,11 +769,13 @@ out_freepages: } static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, - struct page *page, struct inode *inode) + struct page *page, __be32 *verf_arg, + __be32 *verf_res) { struct page **pages; struct nfs_entry *entry; size_t array_size; + struct inode *inode = file_inode(desc->file); size_t dtsize = NFS_SERVER(inode)->dtsize; int status = -ENOMEM; @@ -801,8 +802,9 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, do { unsigned int pglen; - status = nfs_readdir_xdr_filler(desc, entry->cookie, - pages, dtsize); + status = nfs_readdir_xdr_filler(desc, verf_arg, entry->cookie, + pages, dtsize, + verf_res); if (status < 0) break; @@ -854,13 +856,15 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) { struct inode *inode = file_inode(desc->file); struct nfs_inode *nfsi = NFS_I(inode); + __be32 verf[NFS_DIR_VERIFIER_SIZE]; int res; desc->page = nfs_readdir_page_get_cached(desc); if (!desc->page) return -ENOMEM; if (nfs_readdir_page_needs_filling(desc->page)) { - res = nfs_readdir_xdr_to_array(desc, desc->page, inode); + res = nfs_readdir_xdr_to_array(desc, desc->page, + nfsi->cookieverf, verf); if (res < 0) { nfs_readdir_page_unlock_and_put_cached(desc); if (res == -EBADCOOKIE || res == -ENOTSYNC) { @@ -870,6 +874,7 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) } return res; } + memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); } res = nfs_readdir_search_array(desc); if (res == 0) { @@ -902,6 +907,7 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) { struct file *file = desc->file; + struct nfs_inode *nfsi = NFS_I(file_inode(file)); struct nfs_cache_array *array; unsigned int i = 0; @@ -915,6 +921,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) desc->eof = true; break; } + memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf)); 
if (i < (array->size-1)) desc->dir_cookie = array->array[i+1].cookie; else @@ -949,8 +956,8 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) static int uncached_readdir(struct nfs_readdir_descriptor *desc) { struct page *page = NULL; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; int status; - struct inode *inode = file_inode(desc->file); dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)desc->dir_cookie); @@ -967,7 +974,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) desc->duped = 0; nfs_readdir_page_init_array(page, desc->dir_cookie); - status = nfs_readdir_xdr_to_array(desc, page, inode); + status = nfs_readdir_xdr_to_array(desc, page, desc->verf, verf); if (status < 0) goto out_release; @@ -1023,6 +1030,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->dup_cookie = dir_ctx->dup_cookie; desc->duped = dir_ctx->duped; desc->attr_gencount = dir_ctx->attr_gencount; + memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf)); spin_unlock(&file->f_lock); do { @@ -1061,6 +1069,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) dir_ctx->dup_cookie = desc->dup_cookie; dir_ctx->duped = desc->duped; dir_ctx->attr_gencount = desc->attr_gencount; + memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf)); spin_unlock(&file->f_lock); kfree(desc); @@ -1101,6 +1110,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->dir_cookie = offset; else dir_ctx->dir_cookie = 0; + if (offset == 0) + memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf)); dir_ctx->duped = 0; } spin_unlock(&filp->f_lock); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index aa6493905bbe..9b765a900b28 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -229,7 +229,6 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA @@ -1237,7 +1236,6 @@ EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { - struct nfs_inode *nfsi = NFS_I(inode); int ret; if (mapping->nrpages != 0) { @@ -1250,11 +1248,6 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (ret < 0) return ret; } - if (S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); - spin_unlock(&inode->i_lock); - } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index dd6b463dda80..681ed98e4ba8 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -45,6 +45,11 @@ */ #define NFS_RPC_SWAPFLAGS (RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS) +/* + * Size of the NFS directory verifier + */ +#define NFS_DIR_VERIFIER_SIZE 2 + /* * NFSv3/v4 Access mode cache entry */ @@ -89,6 +94,7 @@ struct nfs_open_context { struct nfs_open_dir_context { struct list_head list; unsigned long attr_gencount; + __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; __u64 dup_cookie; signed char duped; @@ -156,7 +162,7 @@ struct nfs_inode { * This is the cookie verifier used for NFSv3 readdir * operations */ - __be32 cookieverf[2]; + __be32 cookieverf[NFS_DIR_VERIFIER_SIZE]; atomic_long_t nrequests; struct nfs_mds_commit_info 
commit_info; -- cgit v1.2.3 From d5aa6b22e2258f05317313ecc02efbb988ed6d38 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 6 Nov 2020 16:33:38 -0500 Subject: SUNRPC: xprt_load_transport() needs to support the netid "rdma6" According to RFC5666, the correct netid for an IPv6-addressed RDMA transport is "rdma6", which we've supported as a mount option since Linux-4.7. The problem is that when we try to load the module "xprtrdma6", it will fail, since there is no module alias of that name. Fixes: 181342c5ebe8 ("xprtrdma: Add rdma6 option to support NFS/RDMA IPv6") Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 65 +++++++++++++++++++++++++++++++---------- net/sunrpc/xprtrdma/module.c | 1 + net/sunrpc/xprtrdma/transport.c | 1 + net/sunrpc/xprtsock.c | 4 +++ 5 files changed, 56 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index a603d48d2b2c..3ac5037d1c3d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -330,6 +330,7 @@ struct xprt_class { struct rpc_xprt * (*setup)(struct xprt_create *); struct module *owner; char name[32]; + const char * netid[]; }; /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index f6c17e75f20e..57f09ea3ef2a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -151,31 +151,64 @@ out: } EXPORT_SYMBOL_GPL(xprt_unregister_transport); +static void +xprt_class_release(const struct xprt_class *t) +{ + module_put(t->owner); +} + +static const struct xprt_class * +xprt_class_find_by_netid_locked(const char *netid) +{ + const struct xprt_class *t; + unsigned int i; + + list_for_each_entry(t, &xprt_list, list) { + for (i = 0; t->netid[i][0] != '\0'; i++) { + if (strcmp(t->netid[i], netid) != 0) + continue; + if (!try_module_get(t->owner)) + continue; + return t; + } + } + return NULL; +} + +static const struct xprt_class * +xprt_class_find_by_netid(const char *netid) +{ + const struct xprt_class *t; + + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + if (!t) { + spin_unlock(&xprt_list_lock); + request_module("rpc%s", netid); + spin_lock(&xprt_list_lock); + t = xprt_class_find_by_netid_locked(netid); + } + spin_unlock(&xprt_list_lock); + return t; +} + /** * xprt_load_transport - load a transport implementation - * @transport_name: transport to load + * @netid: transport to load * * Returns: * 0: transport successfully loaded * -ENOENT: transport module not available */ -int xprt_load_transport(const char *transport_name) +int xprt_load_transport(const char *netid) { - struct xprt_class *t; - int result; + const struct xprt_class *t; - result = 0; - spin_lock(&xprt_list_lock); - list_for_each_entry(t, &xprt_list, list) { - if (strcmp(t->name, transport_name) == 0) { - spin_unlock(&xprt_list_lock); - goto out; - } - } - spin_unlock(&xprt_list_lock); - result = request_module("xprt%s", transport_name); -out: - return result; + t = xprt_class_find_by_netid(netid); + if (!t) + return -ENOENT; + xprt_class_release(t); + return 0; } EXPORT_SYMBOL_GPL(xprt_load_transport); diff --git a/net/sunrpc/xprtrdma/module.c b/net/sunrpc/xprtrdma/module.c index 620327c01302..45c5b41ac8dc 100644 --- a/net/sunrpc/xprtrdma/module.c +++ b/net/sunrpc/xprtrdma/module.c @@ -24,6 +24,7 @@ MODULE_DESCRIPTION("RPC/RDMA Transport"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("svcrdma"); MODULE_ALIAS("xprtrdma"); +MODULE_ALIAS("rpcrdma6"); static void __exit rpc_rdma_cleanup(void) { diff --git
a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 8915e42240d3..035060c05fd5 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -768,6 +768,7 @@ static struct xprt_class xprt_rdma = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_RDMA, .setup = xprt_setup_rdma, + .netid = { "rdma", "rdma6", "" }, }; void xprt_rdma_cleanup(void) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7090bbee0ec5..c93ff70da3f9 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -3059,6 +3059,7 @@ static struct xprt_class xs_local_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_LOCAL, .setup = xs_setup_local, + .netid = { "" }, }; static struct xprt_class xs_udp_transport = { @@ -3067,6 +3068,7 @@ static struct xprt_class xs_udp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_UDP, .setup = xs_setup_udp, + .netid = { "udp", "udp6", "" }, }; static struct xprt_class xs_tcp_transport = { @@ -3075,6 +3077,7 @@ static struct xprt_class xs_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_TCP, .setup = xs_setup_tcp, + .netid = { "tcp", "tcp6", "" }, }; static struct xprt_class xs_bc_tcp_transport = { @@ -3083,6 +3086,7 @@ static struct xprt_class xs_bc_tcp_transport = { .owner = THIS_MODULE, .ident = XPRT_TRANSPORT_BC_TCP, .setup = xs_setup_bc_tcp, + .netid = { "" }, }; /** -- cgit v1.2.3 From 1fc5f13186440973e1aa1d85aa263326756af431 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Nov 2020 09:41:21 -0500 Subject: SUNRPC: Add a helper to return the transport identifier given a netid Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 3ac5037d1c3d..f7b75c72f80e 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -386,6 +386,7 @@ xprt_disable_swap(struct rpc_xprt *xprt) int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); int xprt_load_transport(const char *); +int xprt_find_transport_ident(const char *); void xprt_wait_for_reply_request_def(struct rpc_task *task); void xprt_wait_for_reply_request_rtt(struct rpc_task *task); void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index bf490d0c98c6..23452f57d369 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -219,22 +219,39 @@ xprt_class_find_by_netid(const char *netid) } /** - * xprt_load_transport - load a transport implementation + * xprt_find_transport_ident - convert a netid into a transport identifier * @netid: transport to load * * Returns: - * 0: transport successfully loaded + * > 0: transport identifier * -ENOENT: transport module not available */ -int xprt_load_transport(const char *netid) +int xprt_find_transport_ident(const char *netid) { const struct xprt_class *t; + int ret; t = xprt_class_find_by_netid(netid); if (!t) return -ENOENT; + ret = t->ident; xprt_class_release(t); - return 0; + return ret; +} +EXPORT_SYMBOL_GPL(xprt_find_transport_ident); + +/** + * xprt_load_transport - load a transport implementation + * @netid: transport to load + * + * Returns: + * 0: transport successfully loaded + * -ENOENT: transport module not available + */ +int xprt_load_transport(const char *netid) +{ + int ret = xprt_find_transport_ident(netid); + return ret < 0 ? 
ret : 0; } EXPORT_SYMBOL_GPL(xprt_load_transport); -- cgit v1.2.3 From c87b056e58e71ba7a3f603700618f8da9742aa29 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Nov 2020 10:32:14 -0500 Subject: SUNRPC: Remove unused function xprt_load_transport() Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 - net/sunrpc/xprt.c | 15 --------------- 2 files changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index f7b75c72f80e..d2e97ee802af 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -385,7 +385,6 @@ xprt_disable_swap(struct rpc_xprt *xprt) */ int xprt_register_transport(struct xprt_class *type); int xprt_unregister_transport(struct xprt_class *type); -int xprt_load_transport(const char *); int xprt_find_transport_ident(const char *); void xprt_wait_for_reply_request_def(struct rpc_task *task); void xprt_wait_for_reply_request_rtt(struct rpc_task *task); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 23452f57d369..691ccf8049a4 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -240,21 +240,6 @@ int xprt_find_transport_ident(const char *netid) } EXPORT_SYMBOL_GPL(xprt_find_transport_ident); -/** - * xprt_load_transport - load a transport implementation - * @netid: transport to load - * - * Returns: - * 0: transport successfully loaded - * -ENOENT: transport module not available - */ -int xprt_load_transport(const char *netid) -{ - int ret = xprt_find_transport_ident(netid); - return ret < 0 ? ret : 0; -} -EXPORT_SYMBOL_GPL(xprt_load_transport); - static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; -- cgit v1.2.3 From 608af703519a58f5a7da4273809211cac27edfb2 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 19 Nov 2020 06:09:02 +0000 Subject: libfs: Add generic function for setting dentry_ops This adds a function to set dentry operations at lookup time that will work for both encrypted filenames and casefolded filenames. A filesystem that supports both features simultaneously can use this function during lookup preparations to set up its dentry operations once fscrypt no longer does that itself. Currently the casefolding dentry operations are always set if the filesystem defines an encoding, because the feature is toggleable on empty directories. Unlike in the encryption case, the dentry operations used come from the parent. Since we don't know what set of functions we'll eventually need, and cannot change them later, we enable the casefolding operations if the filesystem supports them at all. By splitting out the various cases, we support as few dentry operations as we can get away with, maximizing compatibility with overlayfs, which will not function if a filesystem supports certain dentry_operations.
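Condensed, the selection it performs reads as follows (a sketch only; the helper below implements the same decision with static ops tables under the matching ifdefs):

static void set_minimal_d_ops(struct dentry *dentry)
{
	bool nokey_name = dentry->d_flags & DCACHE_NOKEY_NAME;	/* encryption */
	bool casefold = dentry->d_sb->s_encoding != NULL;	/* unicode */

	if (nokey_name && casefold)
		d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
	else if (nokey_name)
		d_set_d_op(dentry, &generic_encrypted_dentry_ops);
	else if (casefold)
		d_set_d_op(dentry, &generic_ci_dentry_ops);
	/* else: install nothing, preserving overlayfs compatibility */
}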
Signed-off-by: Daniel Rosenberg Reviewed-by: Theodore Ts'o Reviewed-by: Eric Biggers Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jaegeuk Kim --- fs/libfs.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 71 insertions(+) (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index fc34361c1489..bac918699022 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1449,4 +1449,74 @@ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) return 0; } EXPORT_SYMBOL(generic_ci_d_hash); + +static const struct dentry_operations generic_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, +}; +#endif + +#ifdef CONFIG_FS_ENCRYPTION +static const struct dentry_operations generic_encrypted_dentry_ops = { + .d_revalidate = fscrypt_d_revalidate, +}; +#endif + +#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) +static const struct dentry_operations generic_encrypted_ci_dentry_ops = { + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, + .d_revalidate = fscrypt_d_revalidate, +}; +#endif + +/** + * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry + * @dentry: dentry to set ops on + * + * Casefolded directories need d_hash and d_compare set, so that the dentries + * contained in them are handled case-insensitively. Note that these operations + * are needed on the parent directory rather than on the dentries in it, and + * while the casefolding flag can be toggled on and off on an empty directory, + * dentry_operations can't be changed later. As a result, if the filesystem has + * casefolding support enabled at all, we have to give all dentries the + * casefolding operations even if their inode doesn't have the casefolding flag + * currently (and thus the casefolding ops would be no-ops for now). + * + * Encryption works differently in that the only dentry operation it needs is + * d_revalidate, which it only needs on dentries that have the no-key name flag. + * The no-key flag can't be set "later", so we don't have to worry about that. + * + * Finally, to maximize compatibility with overlayfs (which isn't compatible + * with certain dentry operations) and to avoid taking an unnecessary + * performance hit, we use custom dentry_operations for each possible + * combination rather than always installing all operations. 
+ */ +void generic_set_encrypted_ci_d_ops(struct dentry *dentry) +{ +#ifdef CONFIG_FS_ENCRYPTION + bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; +#endif +#ifdef CONFIG_UNICODE + bool needs_ci_ops = dentry->d_sb->s_encoding; +#endif +#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE) + if (needs_encrypt_ops && needs_ci_ops) { + d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); + return; + } +#endif +#ifdef CONFIG_FS_ENCRYPTION + if (needs_encrypt_ops) { + d_set_d_op(dentry, &generic_encrypted_dentry_ops); + return; + } +#endif +#ifdef CONFIG_UNICODE + if (needs_ci_ops) { + d_set_d_op(dentry, &generic_ci_dentry_ops); + return; + } +#endif +} +EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); diff --git a/include/linux/fs.h b/include/linux/fs.h index 21cc971fd960..4a25ab4dbd3e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3186,6 +3186,7 @@ extern int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); extern int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); #endif +extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, -- cgit v1.2.3 From bb9cd9106b22b4fc5ff8d78a752be8a4ba2cbba5 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Thu, 19 Nov 2020 06:09:03 +0000 Subject: fscrypt: Have filesystems handle their d_ops This shifts the responsibility of setting up dentry operations from fscrypt to the individual filesystems, allowing them to have their own operations while still setting fscrypt's d_revalidate as appropriate. Most filesystems can just use generic_set_encrypted_ci_d_ops, unless they have their own specific dentry operations as well. That helper will set the minimal d_ops required under the circumstances. Since the fscrypt d_ops are now set at lookup time, the filesystem must install the complete set of d_ops then, as they cannot be adjusted later. This should not result in any change in behavior.
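A filesystem lookup hook then takes roughly this shape (myfs_lookup is hypothetical; the ext4/f2fs/ubifs hunks below follow the same pattern):

#include <linux/fs.h>
#include <linux/fscrypt.h>

static struct dentry *myfs_lookup(struct inode *dir, struct dentry *dentry,
				  unsigned int flags)
{
	struct fscrypt_name fname;
	int err;

	err = fscrypt_prepare_lookup(dir, dentry, &fname);
	/* The filesystem, not fscrypt, installs the d_ops; for no-key
	 * names this includes fscrypt_d_revalidate. */
	generic_set_encrypted_ci_d_ops(dentry);
	if (err == -ENOENT)
		return NULL;		/* create a negative dentry */
	if (err)
		return ERR_PTR(err);

	/* ... filesystem-specific directory search would go here ... */

	fscrypt_free_filename(&fname);
	return NULL;
}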
Signed-off-by: Daniel Rosenberg Acked-by: Theodore Ts'o Acked-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/crypto/fname.c | 4 ---- fs/crypto/fscrypt_private.h | 1 - fs/crypto/hooks.c | 1 - fs/ext4/dir.c | 7 ------- fs/ext4/ext4.h | 4 ---- fs/ext4/namei.c | 1 + fs/ext4/super.c | 5 ----- fs/f2fs/dir.c | 7 ------- fs/f2fs/f2fs.h | 3 --- fs/f2fs/namei.c | 1 + fs/f2fs/super.c | 1 - fs/ubifs/dir.c | 1 + include/linux/fscrypt.h | 7 +++++-- 13 files changed, 8 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 1fbe6c24d705..cb3cfa6329ba 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -570,7 +570,3 @@ int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return valid; } EXPORT_SYMBOL_GPL(fscrypt_d_revalidate); - -const struct dentry_operations fscrypt_d_ops = { - .d_revalidate = fscrypt_d_revalidate, -}; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 4f5806a3b73d..df9c48c1fbf7 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -294,7 +294,6 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); -extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 20b0df47fe6a..9006fa983335 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -117,7 +117,6 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); - d_set_d_op(dentry, &fscrypt_d_ops); } return err; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ca50c90adc4c..e757319a4472 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -667,10 +667,3 @@ const struct file_operations ext4_dir_operations = { .open = ext4_dir_open, .release = ext4_release_dir, }; - -#ifdef CONFIG_UNICODE -const struct dentry_operations ext4_dentry_ops = { - .d_hash = generic_ci_d_hash, - .d_compare = generic_ci_d_compare, -}; -#endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 45fcdbf538d1..983f2b970d6a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3354,10 +3354,6 @@ static inline void ext4_unlock_group(struct super_block *sb, /* dir.c */ extern const struct file_operations ext4_dir_operations; -#ifdef CONFIG_UNICODE -extern const struct dentry_operations ext4_dentry_ops; -#endif - /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f458d1d81d96..8e2398e5d0fe 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1614,6 +1614,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return NULL; if (err) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ef4734b40e2a..82b365acedf8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4974,11 +4974,6 @@ no_journal: goto failed_mount4; } -#ifdef CONFIG_UNICODE - if (sb->s_encoding) - sb->s_d_op = &ext4_dentry_ops; -#endif - sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4b9ef8bbfa4a..71fdf5076461 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -1099,10 
+1099,3 @@ const struct file_operations f2fs_dir_operations = { .compat_ioctl = f2fs_compat_ioctl, #endif }; - -#ifdef CONFIG_UNICODE -const struct dentry_operations f2fs_dentry_ops = { - .d_hash = generic_ci_d_hash, - .d_compare = generic_ci_d_compare, -}; -#endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 99bcf4b44a9c..01fd42843e49 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3688,9 +3688,6 @@ static inline void f2fs_update_sit_info(struct f2fs_sb_info *sbi) {} #endif extern const struct file_operations f2fs_dir_operations; -#ifdef CONFIG_UNICODE -extern const struct dentry_operations f2fs_dentry_ops; -#endif extern const struct file_operations f2fs_file_operations; extern const struct inode_operations f2fs_file_inode_operations; extern const struct address_space_operations f2fs_dblock_aops; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 8fa37d1434de..6edb1ab579a1 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -497,6 +497,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } err = f2fs_prepare_lookup(dir, dentry, &fname); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) goto out_splice; if (err) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0ec292d7fcdb..08e63c8caa1e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3426,7 +3426,6 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) sbi->sb->s_encoding = encoding; sbi->sb->s_encoding_flags = encoding_flags; - sbi->sb->s_d_op = &f2fs_dentry_ops; } #else if (f2fs_sb_has_casefold(sbi)) { diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 155521e51ac5..7a920434d741 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -203,6 +203,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); err = fscrypt_prepare_lookup(dir, dentry, &nm); + generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return d_splice_alias(NULL, dentry); if (err) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index a8f7a43f031b..e72f80482671 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -741,8 +741,11 @@ static inline int fscrypt_prepare_rename(struct inode *old_dir, * directory's encryption key is available, then the lookup is assumed to be by * plaintext name; otherwise, it is assumed to be by no-key name. * - * This also installs a custom ->d_revalidate() method which will invalidate the - * dentry if it was created without the key and the key is later added. + * This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key + * name. In this case the filesystem must assign the dentry a dentry_operations + * which contains fscrypt_d_revalidate (or contains a d_revalidate method that + * calls fscrypt_d_revalidate), so that the dentry will be invalidated if the + * directory's encryption key is later added. * * Return: 0 on success; -ENOENT if the directory's key is unavailable but the * filename isn't a valid no-key name, so a negative dentry should be created; -- cgit v1.2.3 From b28f047b28c51d0b9864c34b097bb0b221ea7247 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 26 Nov 2020 18:32:09 +0800 Subject: f2fs: compress: support chksum This patch adds support for storing a checksum value with compressed data, and for verifying the integrity of the compressed data when it is read. The feature can be enabled through the mount option 'compress_chksum'.
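For illustration, the scheme boils down to "CRC over the compressed bytes, stored next to them, recomputed on read". A minimal userspace sketch of that rule, using zlib's crc32() as a stand-in for f2fs_crc32() and a layout that mirrors struct compress_data (the sketch is not part of the patch; build with -lz):

/* Toy model of compress_chksum: seal a payload with a CRC, then
 * verify it on the read path, as f2fs_decompress_pages() does.
 * zlib's crc32() stands in for f2fs_crc32(); endianness conversion
 * and error handling are omitted for brevity.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

struct blob {
	uint32_t clen;		/* compressed data size */
	uint32_t chksum;	/* CRC over cdata[0..clen) */
	uint8_t cdata[64];	/* compressed data */
};

static void seal(struct blob *b, const void *data, uint32_t len)
{
	memcpy(b->cdata, data, len);
	b->clen = len;
	b->chksum = crc32(0L, b->cdata, len);
}

static int verify(const struct blob *b)
{
	return crc32(0L, b->cdata, b->clen) == b->chksum;
}

int main(void)
{
	struct blob b;

	seal(&b, "compressed bytes", 16);
	b.cdata[3] ^= 0x40;			/* simulate on-disk corruption */
	printf("valid: %d\n", verify(&b));	/* prints "valid: 0" */
	return 0;
}

As in the patch, a mismatch is treated as a data-integrity signal rather than a hard read error: f2fs logs it once per inode, sets SBI_NEED_FSCK, and still returns the decompressed data.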
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 1 + fs/f2fs/compress.c | 23 +++++++++++++++++++++++ fs/f2fs/f2fs.h | 16 ++++++++++++++-- fs/f2fs/inode.c | 3 +++ fs/f2fs/super.c | 9 +++++++++ include/linux/f2fs_fs.h | 2 +- 6 files changed, 51 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index b8ee761c9922..985ae7d35066 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -260,6 +260,7 @@ compress_extension=%s Support adding specified extension, so that f2fs can enab For other files, we can still enable compression via ioctl. Note that, there is one reserved special extension '*', it can be set to enable compression for all files. +compress_chksum Support verifying chksum of raw data in compressed cluster. inlinecrypt When possible, encrypt/decrypt the contents of encrypted files using the blk-crypto framework rather than filesystem-layer encryption. This allows the use of diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 14262e0f1cd6..7ec1592a0973 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -602,6 +602,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) f2fs_cops[fi->i_compress_algorithm]; unsigned int max_len, new_nr_cpages; struct page **new_cpages; + u32 chksum = 0; int i, ret; trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, @@ -655,6 +656,11 @@ static int f2fs_compress_pages(struct compress_ctx *cc) cc->cbuf->clen = cpu_to_le32(cc->clen); + if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM) + chksum = f2fs_crc32(F2FS_I_SB(cc->inode), + cc->cbuf->cdata, cc->clen); + cc->cbuf->chksum = cpu_to_le32(chksum); + for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); @@ -790,6 +796,23 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) ret = cops->decompress_pages(dic); + if (!ret && fi->i_compress_flag & 1 << COMPRESS_CHKSUM) { + u32 provided = le32_to_cpu(dic->cbuf->chksum); + u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); + + if (provided != calculated) { + if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { + set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); + printk_ratelimited( + "%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x", + KERN_INFO, sbi->sb->s_id, dic->inode->i_ino, + provided, calculated); + } + set_sbi_flag(sbi, SBI_NEED_FSCK); + WARN_ON_ONCE(1); + } + } + out_vunmap_cbuf: vm_unmap_ram(dic->cbuf, dic->nr_cpages); out_vunmap_rbuf: diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0d25f5ca5618..0b314b2034d8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -147,7 +147,8 @@ struct f2fs_mount_info { /* For compression */ unsigned char compress_algorithm; /* algorithm type */ - unsigned compress_log_size; /* cluster log size */ + unsigned char compress_log_size; /* cluster log size */ + bool compress_chksum; /* compressed data chksum */ unsigned char compress_ext_cnt; /* extension count */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ }; @@ -676,6 +677,7 @@ enum { FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_COMPRESS_CORRUPT, /* indicate compressed cluster is corrupted */ FI_MMAP_FILE, /* indicate file was mmapped */ FI_MAX, /* max flag, never be used */ }; @@ -733,6 +735,7 @@ struct 
f2fs_inode_info { atomic_t i_compr_blocks; /* # of compressed blocks */ unsigned char i_compress_algorithm; /* algorithm type */ unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned short i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ }; @@ -1272,9 +1275,15 @@ enum compress_algorithm_type { COMPRESS_MAX, }; -#define COMPRESS_DATA_RESERVED_SIZE 5 +enum compress_flag { + COMPRESS_CHKSUM, + COMPRESS_MAX_FLAG, +}; + +#define COMPRESS_DATA_RESERVED_SIZE 4 struct compress_data { __le32 clen; /* compressed data size */ + __le32 chksum; /* compressed data chksum */ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; @@ -3888,6 +3897,9 @@ static inline void set_compress_context(struct inode *inode) F2FS_OPTION(sbi).compress_algorithm; F2FS_I(inode)->i_log_cluster_size = F2FS_OPTION(sbi).compress_log_size; + F2FS_I(inode)->i_compress_flag = + F2FS_OPTION(sbi).compress_chksum ? + 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 657db2fb6739..349d9cb933ee 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -456,6 +456,7 @@ static int do_read_inode(struct inode *inode) le64_to_cpu(ri->i_compr_blocks)); fi->i_compress_algorithm = ri->i_compress_algorithm; fi->i_log_cluster_size = ri->i_log_cluster_size; + fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag); fi->i_cluster_size = 1 << fi->i_log_cluster_size; set_inode_flag(inode, FI_COMPRESSED_FILE); } @@ -634,6 +635,8 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) &F2FS_I(inode)->i_compr_blocks)); ri->i_compress_algorithm = F2FS_I(inode)->i_compress_algorithm; + ri->i_compress_flag = + cpu_to_le16(F2FS_I(inode)->i_compress_flag); ri->i_log_cluster_size = F2FS_I(inode)->i_log_cluster_size; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 82baaa89c893..f3d919ee4dee 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -146,6 +146,7 @@ enum { Opt_compress_algorithm, Opt_compress_log_size, Opt_compress_extension, + Opt_compress_chksum, Opt_atgc, Opt_err, }; @@ -214,6 +215,7 @@ static match_table_t f2fs_tokens = { {Opt_compress_algorithm, "compress_algorithm=%s"}, {Opt_compress_log_size, "compress_log_size=%u"}, {Opt_compress_extension, "compress_extension=%s"}, + {Opt_compress_chksum, "compress_chksum"}, {Opt_atgc, "atgc"}, {Opt_err, NULL}, }; @@ -934,10 +936,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; + case Opt_compress_chksum: + F2FS_OPTION(sbi).compress_chksum = true; + break; #else case Opt_compress_algorithm: case Opt_compress_log_size: case Opt_compress_extension: + case Opt_compress_chksum: f2fs_info(sbi, "compression options not supported"); break; #endif @@ -1523,6 +1529,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, seq_printf(seq, ",compress_extension=%s", F2FS_OPTION(sbi).extensions[i]); } + + if (F2FS_OPTION(sbi).compress_chksum) + seq_puts(seq, ",compress_chksum"); } static int f2fs_show_options(struct seq_file *seq, struct dentry *root) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index a5dbb57a687f..7dc2a06cf19a 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -273,7 +273,7 @@ struct f2fs_inode { __le64 i_compr_blocks; /* # of compressed blocks */ __u8 i_compress_algorithm; /* compress algorithm */ __u8 
i_log_cluster_size; /* log of cluster size */ - __le16 i_padding; /* padding */ + __le16 i_compress_flag; /* compress flag */ __le32 i_extra_end[0]; /* for attribute size calculation */ } __packed; __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ -- cgit v1.2.3 From 950cc0d2bef078e1f6459900ca4d4b2a2e0e3c37 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 2 Dec 2020 14:07:07 +0200 Subject: fsnotify: generalize handle_inode_event() The handle_inode_event() interface was added as (quoting comment): "a simple variant of handle_event() for groups that only have inode marks and don't have ignore mask". In other words, all backends except fanotify. The inotify backend also falls under this category, but because it required extra arguments it was left out of the initial pass of backends conversion to the simple interface. This results in code duplication between the generic helper fsnotify_handle_event() and the inotify_handle_event() callback which also happen to be buggy code. Generalize the handle_inode_event() arguments and add the check for FS_EXCL_UNLINK flag to the generic helper, so inotify backend could be converted to use the simple interface. Link: https://lore.kernel.org/r/20201202120713.702387-2-amir73il@gmail.com CC: stable@vger.kernel.org Fixes: b9a1b9772509 ("fsnotify: create method handle_inode_event() in fsnotify_operations") Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/nfsd/filecache.c | 2 +- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fsnotify.c | 31 ++++++++++++++++++++++++------- include/linux/fsnotify_backend.h | 3 ++- kernel/audit_fsnotify.c | 2 +- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 2 +- 7 files changed, 31 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 3c6c2f7d1688..5849c1bd88f1 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -600,7 +600,7 @@ static struct notifier_block nfsd_file_lease_notifier = { static int nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *name) + const struct qstr *name, u32 cookie) { trace_nfsd_file_fsnotify_handle_event(inode, mask); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 5dcda8f20c04..e45ca6ecba95 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -72,7 +72,7 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) */ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *name) + const struct qstr *name, u32 cookie) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 8d3ad5ef2925..c5c68bcbaadf 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -232,6 +232,26 @@ notify: } EXPORT_SYMBOL_GPL(__fsnotify_parent); +static int fsnotify_handle_inode_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + u32 mask, const void *data, int data_type, + struct inode *dir, const struct qstr *name, + u32 cookie) +{ + const struct path *path = fsnotify_data_path(data, data_type); + struct inode *inode = fsnotify_data_inode(data, data_type); + const struct fsnotify_ops *ops = group->ops; + + if (WARN_ON_ONCE(!ops->handle_inode_event)) + return 0; + + if ((inode_mark->mask & FS_EXCL_UNLINK) && + path && d_unlinked(path->dentry)) + return 0; + + return ops->handle_inode_event(inode_mark, 
mask, inode, dir, name, cookie); +} + static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, @@ -239,13 +259,8 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info); - struct inode *inode = fsnotify_data_inode(data, data_type); - const struct fsnotify_ops *ops = group->ops; int ret; - if (WARN_ON_ONCE(!ops->handle_inode_event)) - return 0; - if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; @@ -262,7 +277,8 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, name = NULL; } - ret = ops->handle_inode_event(inode_mark, mask, inode, dir, name); + ret = fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, + dir, name, cookie); if (ret || !child_mark) return ret; @@ -272,7 +288,8 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, * report the event once to parent dir with name and once to child * without name. */ - return ops->handle_inode_event(child_mark, mask, inode, NULL, NULL); + return fsnotify_handle_inode_event(group, child_mark, mask, data, data_type, + NULL, NULL, 0); } static int send_to_group(__u32 mask, const void *data, int data_type, diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index f8529a3a2923..4ee3044eedd0 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -137,6 +137,7 @@ struct mem_cgroup; * if @file_name is not NULL, this is the directory that * @file_name is relative to. * @file_name: optional file name associated with event + * @cookie: inotify rename cookie * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group @@ -151,7 +152,7 @@ struct fsnotify_ops { struct fsnotify_iter_info *iter_info); int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *file_name); + const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_event *event); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index bfcfcd61adb6..5b3f01da172b 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -154,7 +154,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) /* Update mark data in audit rules based on fsnotify events. 
*/ static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *dname) + const struct qstr *dname, u32 cookie) { struct audit_fsnotify_mark *audit_mark; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 83e1c07fc99e..6c91902f4f45 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1037,7 +1037,7 @@ static void evict_chunk(struct audit_chunk *chunk) static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *file_name) + const struct qstr *file_name, u32 cookie) { return 0; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 246e5ba704c0..2acf7ca49154 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -466,7 +466,7 @@ void audit_remove_watch_rule(struct audit_krule *krule) /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *dname) + const struct qstr *dname, u32 cookie) { struct audit_parent *parent; -- cgit v1.2.3 From 50a4952fd67b7f7f551e82ac07c51c1a7a74d474 Mon Sep 17 00:00:00 2001 From: Alexander Lochmann Date: Thu, 15 Oct 2020 15:24:52 +0200 Subject: Updated locking documentation for transaction_t We used LockDoc to derive locking rules for each member of struct transaction_t. Based on those results, we extended the existing documentation to cover more members of struct transaction_t and updated the annotations that were already there. Link: https://lore.kernel.org/r/10cfbef1-994c-c604-f8a6-b1042fcc622f@tu-dortmund.de Signed-off-by: Alexander Lochmann Signed-off-by: Horst Schirmeier Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 578ff196b3ce..d2a4860feb72 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -538,6 +538,7 @@ struct transaction_chp_stats_s { * The transaction keeps track of all of the buffers modified by a * running transaction, and all of the buffers committed but not yet * flushed to home for finished transactions. + * (Locking Documentation improved by LockDoc) */ /* @@ -658,12 +659,12 @@ struct transaction_s unsigned long t_start; /* - * When commit was requested + * When commit was requested [j_state_lock] */ unsigned long t_requested; /* - * Checkpointing stats [j_checkpoint_sem] + * Checkpointing stats [j_list_lock] */ struct transaction_chp_stats_s t_chp_stats; -- cgit v1.2.3 From 14026b94ccfe626e512bc9fa01e0e72ee75c7a98 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 18 Aug 2020 17:19:32 +0000 Subject: signal: Add unsafe_put_compat_sigset() Implement an 'unsafe' version of put_compat_sigset(). For big-endian, use unsafe_put_user() directly to avoid an intermediate copy through the stack. For little-endian, use a straight unsafe_copy_to_user().
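To see why the big-endian variant must split each native 64-bit word by hand while little-endian can copy the bits straight through, consider this small standalone model (plain stores replace unsafe_put_user()/unsafe_copy_to_user(); two native words are assumed, i.e. a 128-bit set):

/* The compat ABI wants an array of 32-bit words with the low half of
 * each native 64-bit word first.  On little-endian hosts that is
 * already the in-memory layout, so a straight copy suffices; on
 * big-endian hosts each word must be split, mirroring the
 * CONFIG_CPU_BIG_ENDIAN branch of unsafe_put_compat_sigset().
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NWORDS 2

static void put_sigset_by_hand(uint32_t *c, const uint64_t *s)
{
	for (int i = 0; i < NWORDS; i++) {
		c[2 * i] = (uint32_t)s[i];		/* low half  */
		c[2 * i + 1] = (uint32_t)(s[i] >> 32);	/* high half */
	}
}

int main(void)
{
	uint64_t s[NWORDS] = { 0x1122334455667788ull, 0x99aabbccddeeff00ull };
	uint32_t by_hand[2 * NWORDS], by_copy[2 * NWORDS];

	put_sigset_by_hand(by_hand, s);
	memcpy(by_copy, s, sizeof(s));	/* the little-endian shortcut */

	printf("straight copy matches: %d\n",
	       !memcmp(by_hand, by_copy, sizeof(by_copy)));
	return 0;
}

The program prints 1 on little-endian hosts and 0 on big-endian ones, which is exactly the distinction the two macro variants encode.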
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/537c7082ee309a0bb9c67a50c5d9dd929aedb82d.1597770847.git.christophe.leroy@csgroup.eu --- include/linux/compat.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 14d514233e1d..400c0941c8af 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -442,6 +442,38 @@ put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, #endif } +#ifdef CONFIG_CPU_BIG_ENDIAN +#define unsafe_put_compat_sigset(compat, set, label) do { \ + compat_sigset_t __user *__c = compat; \ + const sigset_t *__s = set; \ + \ + switch (_NSIG_WORDS) { \ + case 4: \ + unsafe_put_user(__s->sig[3] >> 32, &__c->sig[7], label); \ + unsafe_put_user(__s->sig[3], &__c->sig[6], label); \ + fallthrough; \ + case 3: \ + unsafe_put_user(__s->sig[2] >> 32, &__c->sig[5], label); \ + unsafe_put_user(__s->sig[2], &__c->sig[4], label); \ + fallthrough; \ + case 2: \ + unsafe_put_user(__s->sig[1] >> 32, &__c->sig[3], label); \ + unsafe_put_user(__s->sig[1], &__c->sig[2], label); \ + fallthrough; \ + case 1: \ + unsafe_put_user(__s->sig[0] >> 32, &__c->sig[1], label); \ + unsafe_put_user(__s->sig[0], &__c->sig[0], label); \ + } \ +} while (0) +#else +#define unsafe_put_compat_sigset(compat, set, label) do { \ + compat_sigset_t __user *__c = compat; \ + const sigset_t *__s = set; \ + \ + unsafe_copy_to_user(__c, __s, sizeof(*__c), label); \ +} while (0) +#endif + extern int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); -- cgit v1.2.3 From a15ac665b9e9c90b1557499f2a46c1e89d29154a Mon Sep 17 00:00:00 2001 From: Eric Farman Date: Thu, 3 Dec 2020 22:35:11 +0100 Subject: vfio-mdev: Wire in a request handler for mdev parent While performing some destructive tests with vfio-ccw, where the paths to a device are forcibly removed and thus the device itself is unreachable, it is rather easy to end up in an endless loop in vfio_del_group_dev() due to the lack of a request callback for the associated device. In this example, one MDEV (77c) is used by a guest, while another (77b) is not. The symptom is that the iommu is detached from the mdev for 77b, but not 77c, until that guest is shut down: [ 238.794867] vfio_ccw 0.0.077b: MDEV: Unregistering [ 238.794996] vfio_mdev 11f2d2bc-4083-431d-a023-eff72715c4f0: Removing from iommu group 2 [ 238.795001] vfio_mdev 11f2d2bc-4083-431d-a023-eff72715c4f0: MDEV: detaching iommu [ 238.795036] vfio_ccw 0.0.077c: MDEV: Unregistering ...silence... Let's wire in the request callback to the mdev device, so that a device being physically removed from the host can be (gracefully?) handled by the parent device at the time the device is removed. Add a message when registering the device if a driver doesn't provide this callback, so a clue is given that this same loop may be encountered in a similar situation, and a message when this occurs instead of the awkward silence noted above.
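For a vendor driver, opting in amounts to filling the new hook in its mdev_parent_ops. A sketch of what that could look like (the my_* names and state structure are hypothetical; signalling an eventfd, as vfio-pci does in its own request handler, is one option rather than something this patch mandates):

/* Hypothetical vendor driver: forward the release request to the user
 * via an eventfd registered at open time; "count" grows with each
 * unanswered request, so a driver could escalate on repeats.
 */
#include <linux/mdev.h>
#include <linux/eventfd.h>

static void my_mdev_request(struct mdev_device *mdev, unsigned int count)
{
	struct my_mdev_state *state = mdev_get_drvdata(mdev);

	if (state->req_trigger) {
		dev_dbg(mdev_dev(mdev),
			"Requesting device release (count %u)\n", count);
		eventfd_signal(state->req_trigger, 1);
	}
}

static const struct mdev_parent_ops my_mdev_ops = {
	.owner = THIS_MODULE,
	/* ...create, remove, open, release, read, write, ... */
	.request = my_mdev_request,
};

Drivers that cannot implement the hook simply see the new dev_info() at registration time, and the dev_notice() instead of the silent loop if the situation ever arises.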
Signed-off-by: Eric Farman Reviewed-by: Cornelia Huck Signed-off-by: Alex Williamson --- drivers/vfio/mdev/mdev_core.c | 4 ++++ drivers/vfio/mdev/vfio_mdev.c | 13 +++++++++++++ include/linux/mdev.h | 4 ++++ 3 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index b558d4cfd082..6de97d25a3f8 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -154,6 +154,10 @@ int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops) if (!dev) return -EINVAL; + /* Not mandatory, but its absence could be a problem */ + if (!ops->request) + dev_info(dev, "Driver cannot be asked to release device\n"); + mutex_lock(&parent_list_lock); /* Check for duplicate */ diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index 30964a4e0a28..b52eea128549 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -98,6 +98,18 @@ static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) return parent->ops->mmap(mdev, vma); } +static void vfio_mdev_request(void *device_data, unsigned int count) +{ + struct mdev_device *mdev = device_data; + struct mdev_parent *parent = mdev->parent; + + if (parent->ops->request) + parent->ops->request(mdev, count); + else if (count == 0) + dev_notice(mdev_dev(mdev), + "No mdev vendor driver request callback support, blocked until released by user\n"); +} + static const struct vfio_device_ops vfio_mdev_dev_ops = { .name = "vfio-mdev", .open = vfio_mdev_open, @@ -106,6 +118,7 @@ static const struct vfio_device_ops vfio_mdev_dev_ops = { .read = vfio_mdev_read, .write = vfio_mdev_write, .mmap = vfio_mdev_mmap, + .request = vfio_mdev_request, }; static int vfio_mdev_probe(struct device *dev) diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 0ce30ca78db0..9004375c462e 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -72,6 +72,9 @@ struct device *mdev_get_iommu_device(struct device *dev); * @mmap: mmap callback * @mdev: mediated device structure * @vma: vma structure + * @request: request callback to release device + * @mdev: mediated device structure + * @count: request sequence number * Parent device that support mediated device should be registered with mdev * module with mdev_parent_ops structure. **/ @@ -92,6 +95,7 @@ struct mdev_parent_ops { long (*ioctl)(struct mdev_device *mdev, unsigned int cmd, unsigned long arg); int (*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma); + void (*request)(struct mdev_device *mdev, unsigned int count); }; /* interface for exporting mdev supported type attributes */ -- cgit v1.2.3 From 6df3e14436f6ee254b1a4952d90ee8988be59c89 Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 2 Dec 2020 18:41:02 +0000 Subject: psci: Add accessor for psci_0_1_function_ids Make it possible to retrieve a copy of the psci_0_1_function_ids struct. This is useful for KVM if it is configured to intercept host's PSCI SMCs. 
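The expected use is a one-time snapshot at initialization, along the lines of the following sketch (hypothetical caller; the actual KVM user is outside this patch):

/* Hypothetical consumer: copy the PSCI 0.1 function IDs once at init
 * so intercepted host SMCs can later be matched against them.
 */
#include <linux/psci.h>

static struct psci_0_1_function_ids host_psci_0_1_ids;

static void init_psci_relay(void)
{
	host_psci_0_1_ids = get_psci_0_1_function_ids();
}

static bool is_psci_0_1_cpu_on(u32 func_id)
{
	return func_id == host_psci_0_1_ids.cpu_on;
}

Returning the struct by value keeps the firmware driver's own copy private; callers get a snapshot they can stash in their own data structures.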
Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20201202184122.26046-7-dbrazdil@google.com --- drivers/firmware/psci/psci.c | 12 +++++------- include/linux/psci.h | 9 +++++++++ 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index 593fdd0e09a2..f5fc429cae3f 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -58,15 +58,13 @@ typedef unsigned long (psci_fn)(unsigned long, unsigned long, unsigned long, unsigned long); static psci_fn *invoke_psci_fn; -struct psci_0_1_function_ids { - u32 cpu_suspend; - u32 cpu_on; - u32 cpu_off; - u32 migrate; -}; - static struct psci_0_1_function_ids psci_0_1_function_ids; +struct psci_0_1_function_ids get_psci_0_1_function_ids(void) +{ + return psci_0_1_function_ids; +} + #define PSCI_0_2_POWER_STATE_MASK \ (PSCI_0_2_POWER_STATE_ID_MASK | \ PSCI_0_2_POWER_STATE_TYPE_MASK | \ diff --git a/include/linux/psci.h b/include/linux/psci.h index 2a1bfb890e58..4ca0060a3fc4 100644 --- a/include/linux/psci.h +++ b/include/linux/psci.h @@ -34,6 +34,15 @@ struct psci_operations { extern struct psci_operations psci_ops; +struct psci_0_1_function_ids { + u32 cpu_suspend; + u32 cpu_on; + u32 cpu_off; + u32 migrate; +}; + +struct psci_0_1_function_ids get_psci_0_1_function_ids(void); + #if defined(CONFIG_ARM_PSCI_FW) int __init psci_dt_init(void); #else -- cgit v1.2.3 From 2d26c716fc49f41a63e1efe8f1f772b0adeaacef Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 7 Dec 2020 10:13:08 +0100 Subject: module: drop semicolon from version macro Drop the trailing semicolon from the MODULE_VERSION() macro definition which was left when removing the array-of-pointer indirection. Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 5958075ea3f4..e7a619c2457e 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -279,7 +279,7 @@ extern typeof(name) __mod_##type##__##name##_device_table \ }, \ .module_name = KBUILD_MODNAME, \ .version = _version, \ - }; + } #endif /* Optional firmware file (or files) needed by the module -- cgit v1.2.3 From 0eba770790426553f45b8643bcd77b854e045057 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 5 Nov 2020 20:27:45 +0100 Subject: clk: composite: add devm_clk_hw_register_composite_pdata() This will simplify drivers which would only unregister the clk in their remove() op. 
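With the devres variant, a clock provider's probe() needs no matching unregister in remove(); a minimal sketch (the my_* parent-data, mux, divider and gate objects are placeholders a real driver would set up first):

/* Placeholder probe(): the composite clock registered here is
 * unregistered automatically by devres when the device goes away.
 */
#include <linux/clk-provider.h>
#include <linux/platform_device.h>

static int my_clk_probe(struct platform_device *pdev)
{
	struct clk_hw *hw;

	hw = devm_clk_hw_register_composite_pdata(&pdev->dev, "my_clk",
			my_parent_data, ARRAY_SIZE(my_parent_data),
			&my_mux.hw, &clk_mux_ops,
			&my_div.hw, &clk_divider_ops,
			&my_gate.hw, &clk_gate_ops, 0);
	if (IS_ERR(hw))
		return PTR_ERR(hw);

	return devm_of_clk_add_hw_provider(&pdev->dev, of_clk_hw_simple_get, hw);
}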
Signed-off-by: Michael Walle Link: https://lore.kernel.org/r/20201105192746.19564-3-michael@walle.cc Signed-off-by: Stephen Boyd --- drivers/clk/clk-composite.c | 50 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/clk-provider.h | 7 +++++++ 2 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c index 2ddb54f7d3ab..0506046a5f4b 100644 --- a/drivers/clk/clk-composite.c +++ b/drivers/clk/clk-composite.c @@ -4,6 +4,7 @@ */ #include +#include #include #include @@ -405,3 +406,52 @@ void clk_hw_unregister_composite(struct clk_hw *hw) kfree(composite); } EXPORT_SYMBOL_GPL(clk_hw_unregister_composite); + +static void devm_clk_hw_release_composite(struct device *dev, void *res) +{ + clk_hw_unregister_composite(*(struct clk_hw **)res); +} + +static struct clk_hw *__devm_clk_hw_register_composite(struct device *dev, + const char *name, const char * const *parent_names, + const struct clk_parent_data *pdata, int num_parents, + struct clk_hw *mux_hw, const struct clk_ops *mux_ops, + struct clk_hw *rate_hw, const struct clk_ops *rate_ops, + struct clk_hw *gate_hw, const struct clk_ops *gate_ops, + unsigned long flags) +{ + struct clk_hw **ptr, *hw; + + ptr = devres_alloc(devm_clk_hw_release_composite, sizeof(*ptr), + GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + hw = __clk_hw_register_composite(dev, name, parent_names, pdata, + num_parents, mux_hw, mux_ops, rate_hw, + rate_ops, gate_hw, gate_ops, flags); + + if (!IS_ERR(hw)) { + *ptr = hw; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return hw; +} + +struct clk_hw *devm_clk_hw_register_composite_pdata(struct device *dev, + const char *name, + const struct clk_parent_data *parent_data, + int num_parents, + struct clk_hw *mux_hw, const struct clk_ops *mux_ops, + struct clk_hw *rate_hw, const struct clk_ops *rate_ops, + struct clk_hw *gate_hw, const struct clk_ops *gate_ops, + unsigned long flags) +{ + return __devm_clk_hw_register_composite(dev, name, NULL, parent_data, + num_parents, mux_hw, mux_ops, + rate_hw, rate_ops, gate_hw, + gate_ops, flags); +} diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 03a5de5f99f4..33db52ff83a0 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1062,6 +1062,13 @@ struct clk_hw *clk_hw_register_composite_pdata(struct device *dev, struct clk_hw *rate_hw, const struct clk_ops *rate_ops, struct clk_hw *gate_hw, const struct clk_ops *gate_ops, unsigned long flags); +struct clk_hw *devm_clk_hw_register_composite_pdata(struct device *dev, + const char *name, const struct clk_parent_data *parent_data, + int num_parents, + struct clk_hw *mux_hw, const struct clk_ops *mux_ops, + struct clk_hw *rate_hw, const struct clk_ops *rate_ops, + struct clk_hw *gate_hw, const struct clk_ops *gate_ops, + unsigned long flags); void clk_hw_unregister_composite(struct clk_hw *hw); struct clk *clk_register(struct device *dev, struct clk_hw *hw); -- cgit v1.2.3 From d9a9280a0d0ae51dc1d4142138b99242b7ec8ac6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:10:58 +0100 Subject: seq_buf: Avoid type mismatch for seq_buf_init Building with W=2 prints a number of warnings for one function that has a pointer type mismatch: linux/seq_buf.h: In function 'seq_buf_init': linux/seq_buf.h:35:12: warning: pointer targets in assignment from 'unsigned char *' to 'char *' differ in signedness [-Wpointer-sign] Change the type in the function prototype according to the 
type in the structure. Link: https://lkml.kernel.org/r/20201026161108.3707783-1-arnd@kernel.org Fixes: 9a7777935c34 ("tracing: Convert seq_buf fields to be like seq_file fields") Reviewed-by: Cezary Rojewski Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- include/linux/seq_buf.h | 2 +- include/linux/trace_seq.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index fb0205d87d3c..9d6c28cc4d8f 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -30,7 +30,7 @@ static inline void seq_buf_clear(struct seq_buf *s) } static inline void -seq_buf_init(struct seq_buf *s, unsigned char *buf, unsigned int size) +seq_buf_init(struct seq_buf *s, char *buf, unsigned int size) { s->buffer = buf; s->size = size; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 6c30508fca19..5a2c650d9e1c 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -12,7 +12,7 @@ */ struct trace_seq { - unsigned char buffer[PAGE_SIZE]; + char buffer[PAGE_SIZE]; struct seq_buf seq; int full; }; @@ -51,7 +51,7 @@ static inline int trace_seq_used(struct trace_seq *s) * that is about to be written to and then return the result * of that write. */ -static inline unsigned char * +static inline char * trace_seq_buffer_ptr(struct trace_seq *s) { return s->buffer + seq_buf_used(&s->seq); -- cgit v1.2.3 From 26792699fe3681102aa85f4ae6d39e80a6a7e6b6 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Sun, 8 Nov 2020 19:51:09 +0100 Subject: clk: divider: add devm_clk_hw_register_divider_table() This will simplify drivers which would only unregister the clk in their remove() op. Signed-off-by: Michael Walle Link: https://lore.kernel.org/r/20201108185113.31377-6-michael@walle.cc Signed-off-by: Stephen Boyd --- drivers/clk/clk-divider.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/clk-provider.h | 27 +++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index 8de12cb0c43d..c499799693cc 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include @@ -578,3 +579,36 @@ void clk_hw_unregister_divider(struct clk_hw *hw) kfree(div); } EXPORT_SYMBOL_GPL(clk_hw_unregister_divider); + +static void devm_clk_hw_release_divider(struct device *dev, void *res) +{ + clk_hw_unregister_divider(*(struct clk_hw **)res); +} + +struct clk_hw *__devm_clk_hw_register_divider(struct device *dev, + struct device_node *np, const char *name, + const char *parent_name, const struct clk_hw *parent_hw, + const struct clk_parent_data *parent_data, unsigned long flags, + void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, + const struct clk_div_table *table, spinlock_t *lock) +{ + struct clk_hw **ptr, *hw; + + ptr = devres_alloc(devm_clk_hw_release_divider, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + hw = __clk_hw_register_divider(dev, np, name, parent_name, parent_hw, + parent_data, flags, reg, shift, width, + clk_divider_flags, table, lock); + + if (!IS_ERR(hw)) { + *ptr = hw; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return hw; +} +EXPORT_SYMBOL_GPL(__devm_clk_hw_register_divider); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 33db52ff83a0..5f896df01f83 100644 --- a/include/linux/clk-provider.h +++ 
b/include/linux/clk-provider.h @@ -639,6 +639,12 @@ struct clk_hw *__clk_hw_register_divider(struct device *dev, const struct clk_parent_data *parent_data, unsigned long flags, void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, const struct clk_div_table *table, spinlock_t *lock); +struct clk_hw *__devm_clk_hw_register_divider(struct device *dev, + struct device_node *np, const char *name, + const char *parent_name, const struct clk_hw *parent_hw, + const struct clk_parent_data *parent_data, unsigned long flags, + void __iomem *reg, u8 shift, u8 width, u8 clk_divider_flags, + const struct clk_div_table *table, spinlock_t *lock); struct clk *clk_register_divider_table(struct device *dev, const char *name, const char *parent_name, unsigned long flags, void __iomem *reg, u8 shift, u8 width, @@ -779,6 +785,27 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name, (parent_data), (flags), (reg), (shift), \ (width), (clk_divider_flags), (table), \ (lock)) +/** + * devm_clk_hw_register_divider_table - register a table based divider clock + * with the clock framework (devres variant) + * @dev: device registering this clock + * @name: name of this clock + * @parent_name: name of clock's parent + * @flags: framework-specific flags + * @reg: register address to adjust divider + * @shift: number of bits to shift the bitfield + * @width: width of the bitfield + * @clk_divider_flags: divider-specific flags for this clock + * @table: array of divider/value pairs ending with a div set to 0 + * @lock: shared register lock for this clock + */ +#define devm_clk_hw_register_divider_table(dev, name, parent_name, flags, \ + reg, shift, width, \ + clk_divider_flags, table, lock) \ + __devm_clk_hw_register_divider((dev), NULL, (name), (parent_name), \ + NULL, NULL, (flags), (reg), (shift), \ + (width), (clk_divider_flags), (table), \ + (lock)) void clk_unregister_divider(struct clk *clk); void clk_hw_unregister_divider(struct clk_hw *hw); -- cgit v1.2.3 From 0854bcdcdec26aecdc92c303816f349ee1fba2bc Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 8 Dec 2020 21:29:45 -0800 Subject: scsi: block: Introduce BLK_MQ_REQ_PM Introduce the BLK_MQ_REQ_PM flag. This flag makes the request allocation functions set RQF_PM. This is the first step towards removing BLK_MQ_REQ_PREEMPT. Link: https://lore.kernel.org/r/20201209052951.16136-3-bvanassche@acm.org Cc: Alan Stern Cc: Stanley Chu Cc: Ming Lei Cc: Rafael J. Wysocki Cc: Can Guo Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Jens Axboe Reviewed-by: Can Guo Signed-off-by: Bart Van Assche Signed-off-by: Martin K. 
Petersen --- block/blk-core.c | 7 ++++--- block/blk-mq.c | 2 ++ include/linux/blk-mq.h | 2 ++ 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 2db8bda43b6e..10696f9fb6ac 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -424,11 +424,11 @@ EXPORT_SYMBOL(blk_cleanup_queue); /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer - * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT + * @flags: BLK_MQ_REQ_NOWAIT, BLK_MQ_REQ_PM and/or BLK_MQ_REQ_PREEMPT */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { - const bool pm = flags & BLK_MQ_REQ_PREEMPT; + const bool pm = flags & (BLK_MQ_REQ_PM | BLK_MQ_REQ_PREEMPT); while (true) { bool success = false; @@ -630,7 +630,8 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op, struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); + WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM | + BLK_MQ_REQ_PREEMPT)); req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1b25ec2fe9be..b5880a1fb38d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -292,6 +292,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->mq_hctx = data->hctx; rq->rq_flags = 0; rq->cmd_flags = data->cmd_flags; + if (data->flags & BLK_MQ_REQ_PM) + rq->rq_flags |= RQF_PM; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b23eeca4d677..c00e856c6fb1 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -444,6 +444,8 @@ enum { BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), /* allocate from reserved pool */ BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), + /* set RQF_PM */ + BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), /* set RQF_PREEMPT */ BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), }; -- cgit v1.2.3 From a4d34da715e3cb7e0741fe603dcd511bed067e00 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 8 Dec 2020 21:29:50 -0800 Subject: scsi: block: Remove RQF_PREEMPT and BLK_MQ_REQ_PREEMPT Remove flag RQF_PREEMPT and BLK_MQ_REQ_PREEMPT since these are no longer used by any kernel code. Link: https://lore.kernel.org/r/20201209052951.16136-8-bvanassche@acm.org Cc: Can Guo Cc: Stanley Chu Cc: Alan Stern Cc: Ming Lei Cc: Rafael J. Wysocki Cc: Martin Kepplinger Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Jens Axboe Reviewed-by: Can Guo Signed-off-by: Bart Van Assche Signed-off-by: Martin K. 
Petersen --- block/blk-core.c | 7 +++---- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 2 -- include/linux/blk-mq.h | 2 -- include/linux/blkdev.h | 6 +----- 5 files changed, 4 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 10696f9fb6ac..a00bce9f46d8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -424,11 +424,11 @@ EXPORT_SYMBOL(blk_cleanup_queue); /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer - * @flags: BLK_MQ_REQ_NOWAIT, BLK_MQ_REQ_PM and/or BLK_MQ_REQ_PREEMPT + * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { - const bool pm = flags & (BLK_MQ_REQ_PM | BLK_MQ_REQ_PREEMPT); + const bool pm = flags & BLK_MQ_REQ_PM; while (true) { bool success = false; @@ -630,8 +630,7 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op, struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM | - BLK_MQ_REQ_PREEMPT)); + WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM)); req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3094542e12ae..9336a6f8d6ef 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -297,7 +297,6 @@ static const char *const rqf_name[] = { RQF_NAME(MIXED_MERGE), RQF_NAME(MQ_INFLIGHT), RQF_NAME(DONTPREP), - RQF_NAME(PREEMPT), RQF_NAME(FAILED), RQF_NAME(QUIET), RQF_NAME(ELVPRIV), diff --git a/block/blk-mq.c b/block/blk-mq.c index b5880a1fb38d..d50504888b68 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -294,8 +294,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PM) rq->rq_flags |= RQF_PM; - if (data->flags & BLK_MQ_REQ_PREEMPT) - rq->rq_flags |= RQF_PREEMPT; if (blk_queue_io_stat(data->q)) rq->rq_flags |= RQF_IO_STAT; INIT_LIST_HEAD(&rq->queuelist); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c00e856c6fb1..88af1df94308 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -446,8 +446,6 @@ enum { BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), /* set RQF_PM */ BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), - /* set RQF_PREEMPT */ - BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), }; struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 639cae2c158b..7d4b746f7e6a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -79,9 +79,6 @@ typedef __u32 __bitwise req_flags_t; #define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) /* don't call prep for this one */ #define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) -/* set for "ide_preempt" requests and also for requests for which the SCSI - "quiesce" state must be ignored. */ -#define RQF_PREEMPT ((__force req_flags_t)(1 << 8)) /* vaguely specified driver internal error. Ignored by the block layer */ #define RQF_FAILED ((__force req_flags_t)(1 << 10)) /* don't warn about errors */ @@ -430,8 +427,7 @@ struct request_queue { unsigned long queue_flags; /* * Number of contexts that have called blk_set_pm_only(). If this - * counter is above zero then only RQF_PM and RQF_PREEMPT requests are - * processed. + * counter is above zero then only RQF_PM requests are processed. 
*/ atomic_t pm_only; -- cgit v1.2.3 From 52abca64fd9410ea6c9a3a74eab25663b403d7da Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 8 Dec 2020 21:29:51 -0800 Subject: scsi: block: Do not accept any requests while suspended blk_queue_enter() accepts BLK_MQ_REQ_PM requests independent of the runtime power management state. Now that SCSI domain validation no longer depends on this behavior, modify the behavior of blk_queue_enter() as follows: - Do not accept any requests while suspended. - Only process power management requests while suspending or resuming. Submitting BLK_MQ_REQ_PM requests to a device that is runtime suspended causes runtime-suspended devices not to resume as they should. The request which should cause a runtime resume instead gets issued directly, without resuming the device first. Of course the device can't handle it properly, the I/O fails, and the device remains suspended. The problem is fixed by checking that the queue's runtime-PM status isn't RPM_SUSPENDED before allowing a request to be issued, and queuing a runtime-resume request if it is. In particular, the inline blk_pm_request_resume() routine is renamed blk_pm_resume_queue() and the code is unified by merging the surrounding checks into the routine. If the queue isn't set up for runtime PM, or there currently is no restriction on allowed requests, the request is allowed. Likewise if the BLK_MQ_REQ_PM flag is set and the status isn't RPM_SUSPENDED. Otherwise a runtime resume is queued and the request is blocked until conditions are more suitable. [ bvanassche: modified commit message and removed Cc: stable because without the previous patches from this series this patch would break parallel SCSI domain validation + introduced queue_rpm_status() ] Link: https://lore.kernel.org/r/20201209052951.16136-9-bvanassche@acm.org Cc: Jens Axboe Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Can Guo Cc: Stanley Chu Cc: Ming Lei Cc: Rafael J. Wysocki Reported-and-tested-by: Martin Kepplinger Reviewed-by: Hannes Reinecke Reviewed-by: Can Guo Signed-off-by: Alan Stern Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- block/blk-core.c | 7 ++++--- block/blk-pm.h | 14 +++++++++----- include/linux/blkdev.h | 12 ++++++++++++ 3 files changed, 25 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index a00bce9f46d8..2d53e2ff48ff 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -440,7 +441,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) * responsible for ensuring that that counter is * globally visible before the queue is unfrozen. 
*/ - if (pm || !blk_queue_pm_only(q)) { + if ((pm && queue_rpm_status(q) != RPM_SUSPENDED) || + !blk_queue_pm_only(q)) { success = true; } else { percpu_ref_put(&q->q_usage_counter); @@ -465,8 +467,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && - (pm || (blk_pm_request_resume(q), - !blk_queue_pm_only(q)))) || + blk_pm_resume_queue(pm, q)) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; diff --git a/block/blk-pm.h b/block/blk-pm.h index ea5507d23e75..a2283cc9f716 100644 --- a/block/blk-pm.h +++ b/block/blk-pm.h @@ -6,11 +6,14 @@ #include #ifdef CONFIG_PM -static inline void blk_pm_request_resume(struct request_queue *q) +static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { - if (q->dev && (q->rpm_status == RPM_SUSPENDED || - q->rpm_status == RPM_SUSPENDING)) - pm_request_resume(q->dev); + if (!q->dev || !blk_queue_pm_only(q)) + return 1; /* Nothing to do */ + if (pm && q->rpm_status != RPM_SUSPENDED) + return 1; /* Request allowed */ + pm_request_resume(q->dev); + return 0; } static inline void blk_pm_mark_last_busy(struct request *rq) @@ -44,8 +47,9 @@ static inline void blk_pm_put_request(struct request *rq) --rq->q->nr_pending; } #else -static inline void blk_pm_request_resume(struct request_queue *q) +static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { + return 1; } static inline void blk_pm_mark_last_busy(struct request *rq) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7d4b746f7e6a..2b6fc3fb3a99 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -692,6 +692,18 @@ static inline bool queue_is_mq(struct request_queue *q) return q->mq_ops; } +#ifdef CONFIG_PM +static inline enum rpm_status queue_rpm_status(struct request_queue *q) +{ + return q->rpm_status; +} +#else +static inline enum rpm_status queue_rpm_status(struct request_queue *q) +{ + return RPM_ACTIVE; +} +#endif + static inline enum blk_zoned_model blk_queue_zoned_model(struct request_queue *q) { -- cgit v1.2.3 From 30b79eb1f92ed5974885d374a4107c94e2dd3e03 Mon Sep 17 00:00:00 2001 From: Michael Tretter Date: Mon, 9 Nov 2020 14:48:17 +0100 Subject: soc: xilinx: vcu: use vcu-settings syscon registers Switch the "logicoreip" registers to the new xlnx,vcu-settings binding to be able to read the settings if the settings are specified in a separate device tree node that is shared with other drivers. If the driver is not able to find a node with the new binding, fall back to check for the logicore register bank to be backwards compatible. 
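Other drivers that share the settings block can then reach the same registers through the syscon without claiming the memory region themselves; a sketch of such a consumer (hypothetical function; the register offsets come from the new xlnx-vcu.h header introduced below):

/* Hypothetical consumer of the shared VCU settings block. */
#include <linux/mfd/syscon.h>
#include <linux/mfd/syscon/xlnx-vcu.h>
#include <linux/regmap.h>

static int my_get_vcu_core_clk(u32 *coreclk)
{
	struct regmap *map;

	map = syscon_regmap_lookup_by_compatible("xlnx,vcu-settings");
	if (IS_ERR(map))
		return PTR_ERR(map);

	return regmap_read(map, VCU_CORE_CLK, coreclk);
}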
Signed-off-by: Michael Tretter Reviewed-by: Hyun Kwon Link: https://lore.kernel.org/r/20201109134818.4159342-4-m.tretter@pengutronix.de Signed-off-by: Michal Simek --- drivers/soc/xilinx/Kconfig | 1 + drivers/soc/xilinx/xlnx_vcu.c | 94 ++++++++++++++++++------------------- include/linux/mfd/syscon/xlnx-vcu.h | 38 +++++++++++++++ 3 files changed, 86 insertions(+), 47 deletions(-) create mode 100644 include/linux/mfd/syscon/xlnx-vcu.h (limited to 'include/linux') diff --git a/drivers/soc/xilinx/Kconfig b/drivers/soc/xilinx/Kconfig index 646512d7276f..0b1708dae361 100644 --- a/drivers/soc/xilinx/Kconfig +++ b/drivers/soc/xilinx/Kconfig @@ -4,6 +4,7 @@ menu "Xilinx SoC drivers" config XILINX_VCU tristate "Xilinx VCU logicoreIP Init" depends on HAS_IOMEM + select REGMAP_MMIO help Provides the driver to enable and disable the isolation between the processing system and programmable logic part by using the logicoreIP diff --git a/drivers/soc/xilinx/xlnx_vcu.c b/drivers/soc/xilinx/xlnx_vcu.c index dcd8e7824b06..14daad4efc58 100644 --- a/drivers/soc/xilinx/xlnx_vcu.c +++ b/drivers/soc/xilinx/xlnx_vcu.c @@ -10,39 +10,12 @@ #include #include #include +#include +#include #include #include #include - -/* Address map for different registers implemented in the VCU LogiCORE IP. */ -#define VCU_ECODER_ENABLE 0x00 -#define VCU_DECODER_ENABLE 0x04 -#define VCU_MEMORY_DEPTH 0x08 -#define VCU_ENC_COLOR_DEPTH 0x0c -#define VCU_ENC_VERTICAL_RANGE 0x10 -#define VCU_ENC_FRAME_SIZE_X 0x14 -#define VCU_ENC_FRAME_SIZE_Y 0x18 -#define VCU_ENC_COLOR_FORMAT 0x1c -#define VCU_ENC_FPS 0x20 -#define VCU_MCU_CLK 0x24 -#define VCU_CORE_CLK 0x28 -#define VCU_PLL_BYPASS 0x2c -#define VCU_ENC_CLK 0x30 -#define VCU_PLL_CLK 0x34 -#define VCU_ENC_VIDEO_STANDARD 0x38 -#define VCU_STATUS 0x3c -#define VCU_AXI_ENC_CLK 0x40 -#define VCU_AXI_DEC_CLK 0x44 -#define VCU_AXI_MCU_CLK 0x48 -#define VCU_DEC_VIDEO_STANDARD 0x4c -#define VCU_DEC_FRAME_SIZE_X 0x50 -#define VCU_DEC_FRAME_SIZE_Y 0x54 -#define VCU_DEC_FPS 0x58 -#define VCU_BUFFER_B_FRAME 0x5c -#define VCU_WPP_EN 0x60 -#define VCU_PLL_CLK_DEC 0x64 -#define VCU_GASKET_INIT 0x74 -#define VCU_GASKET_VALUE 0x03 +#include /* vcu slcr registers, bitmask and shift */ #define VCU_PLL_CTRL 0x24 @@ -106,11 +79,20 @@ struct xvcu_device { struct device *dev; struct clk *pll_ref; struct clk *aclk; - void __iomem *logicore_reg_ba; + struct regmap *logicore_reg_ba; void __iomem *vcu_slcr_ba; u32 coreclk; }; +static struct regmap_config vcu_settings_regmap_config = { + .name = "regmap", + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, + .max_register = 0xfff, + .cache_type = REGCACHE_NONE, +}; + /** * struct xvcu_pll_cfg - Helper data * @fbdiv: The integer portion of the feedback divider to the PLL @@ -300,10 +282,12 @@ static int xvcu_set_vcu_pll_info(struct xvcu_device *xvcu) int ret, i; const struct xvcu_pll_cfg *found = NULL; - inte = xvcu_read(xvcu->logicore_reg_ba, VCU_PLL_CLK); - deci = xvcu_read(xvcu->logicore_reg_ba, VCU_PLL_CLK_DEC); - coreclk = xvcu_read(xvcu->logicore_reg_ba, VCU_CORE_CLK) * MHZ; - mcuclk = xvcu_read(xvcu->logicore_reg_ba, VCU_MCU_CLK) * MHZ; + regmap_read(xvcu->logicore_reg_ba, VCU_PLL_CLK, &inte); + regmap_read(xvcu->logicore_reg_ba, VCU_PLL_CLK_DEC, &deci); + regmap_read(xvcu->logicore_reg_ba, VCU_CORE_CLK, &coreclk); + coreclk *= MHZ; + regmap_read(xvcu->logicore_reg_ba, VCU_MCU_CLK, &mcuclk); + mcuclk *= MHZ; if (!mcuclk || !coreclk) { dev_err(xvcu->dev, "Invalid mcu and core clock data\n"); return -EINVAL; @@ -498,6 +482,7 @@ static int xvcu_probe(struct 
platform_device *pdev) { struct resource *res; struct xvcu_device *xvcu; + void __iomem *regs; int ret; xvcu = devm_kzalloc(&pdev->dev, sizeof(*xvcu), GFP_KERNEL); @@ -518,17 +503,32 @@ static int xvcu_probe(struct platform_device *pdev) return -ENOMEM; } - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "logicore"); - if (!res) { - dev_err(&pdev->dev, "get logicore memory resource failed.\n"); - return -ENODEV; - } + xvcu->logicore_reg_ba = + syscon_regmap_lookup_by_compatible("xlnx,vcu-settings"); + if (IS_ERR(xvcu->logicore_reg_ba)) { + dev_info(&pdev->dev, + "could not find xlnx,vcu-settings: trying direct register access\n"); + + res = platform_get_resource_byname(pdev, + IORESOURCE_MEM, "logicore"); + if (!res) { + dev_err(&pdev->dev, "get logicore memory resource failed.\n"); + return -ENODEV; + } - xvcu->logicore_reg_ba = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (!xvcu->logicore_reg_ba) { - dev_err(&pdev->dev, "logicore register mapping failed.\n"); - return -ENOMEM; + regs = devm_ioremap(&pdev->dev, res->start, resource_size(res)); + if (!regs) { + dev_err(&pdev->dev, "logicore register mapping failed.\n"); + return -ENOMEM; + } + + xvcu->logicore_reg_ba = + devm_regmap_init_mmio(&pdev->dev, regs, + &vcu_settings_regmap_config); + if (IS_ERR(xvcu->logicore_reg_ba)) { + dev_err(&pdev->dev, "failed to init regmap\n"); + return PTR_ERR(xvcu->logicore_reg_ba); + } } xvcu->aclk = devm_clk_get(&pdev->dev, "aclk"); @@ -560,7 +560,7 @@ static int xvcu_probe(struct platform_device *pdev) * Bit 0 : Gasket isolation * Bit 1 : put VCU out of reset */ - xvcu_write(xvcu->logicore_reg_ba, VCU_GASKET_INIT, VCU_GASKET_VALUE); + regmap_write(xvcu->logicore_reg_ba, VCU_GASKET_INIT, VCU_GASKET_VALUE); /* Do the PLL Settings based on the ref clk,core and mcu clk freq */ ret = xvcu_set_pll(xvcu); @@ -597,7 +597,7 @@ static int xvcu_remove(struct platform_device *pdev) return -ENODEV; /* Add the the Gasket isolation and put the VCU in reset. 
*/ - xvcu_write(xvcu->logicore_reg_ba, VCU_GASKET_INIT, 0); + regmap_write(xvcu->logicore_reg_ba, VCU_GASKET_INIT, 0); clk_disable_unprepare(xvcu->pll_ref); clk_disable_unprepare(xvcu->aclk); diff --git a/include/linux/mfd/syscon/xlnx-vcu.h b/include/linux/mfd/syscon/xlnx-vcu.h new file mode 100644 index 000000000000..d3edec4b7b1d --- /dev/null +++ b/include/linux/mfd/syscon/xlnx-vcu.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020 Pengutronix, Michael Tretter + */ + +#ifndef __XLNX_VCU_H +#define __XLNX_VCU_H + +#define VCU_ECODER_ENABLE 0x00 +#define VCU_DECODER_ENABLE 0x04 +#define VCU_MEMORY_DEPTH 0x08 +#define VCU_ENC_COLOR_DEPTH 0x0c +#define VCU_ENC_VERTICAL_RANGE 0x10 +#define VCU_ENC_FRAME_SIZE_X 0x14 +#define VCU_ENC_FRAME_SIZE_Y 0x18 +#define VCU_ENC_COLOR_FORMAT 0x1c +#define VCU_ENC_FPS 0x20 +#define VCU_MCU_CLK 0x24 +#define VCU_CORE_CLK 0x28 +#define VCU_PLL_BYPASS 0x2c +#define VCU_ENC_CLK 0x30 +#define VCU_PLL_CLK 0x34 +#define VCU_ENC_VIDEO_STANDARD 0x38 +#define VCU_STATUS 0x3c +#define VCU_AXI_ENC_CLK 0x40 +#define VCU_AXI_DEC_CLK 0x44 +#define VCU_AXI_MCU_CLK 0x48 +#define VCU_DEC_VIDEO_STANDARD 0x4c +#define VCU_DEC_FRAME_SIZE_X 0x50 +#define VCU_DEC_FRAME_SIZE_Y 0x54 +#define VCU_DEC_FPS 0x58 +#define VCU_BUFFER_B_FRAME 0x5c +#define VCU_WPP_EN 0x60 +#define VCU_PLL_CLK_DEC 0x64 +#define VCU_GASKET_INIT 0x74 +#define VCU_GASKET_VALUE 0x03 + +#endif /* __XLNX_VCU_H */ -- cgit v1.2.3 From 7b1c9b8441aa94a549a90fa3d42687ccbad3eade Mon Sep 17 00:00:00 2001 From: Michael Tretter Date: Mon, 9 Nov 2020 14:48:18 +0100 Subject: soc: xilinx: vcu: add missing register NUM_CORE The H.264/H.265 Video Codec Unit v1.2 documentation describes this register as follows: Number of encoders core used for the provided configuration This is required for configuring the VCU encoder buffer. Signed-off-by: Michael Tretter Reviewed-by: Hyun Kwon Link: https://lore.kernel.org/r/20201109134818.4159342-5-m.tretter@pengutronix.de Signed-off-by: Michal Simek --- include/linux/mfd/syscon/xlnx-vcu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/syscon/xlnx-vcu.h b/include/linux/mfd/syscon/xlnx-vcu.h index d3edec4b7b1d..ff7bc3656f6e 100644 --- a/include/linux/mfd/syscon/xlnx-vcu.h +++ b/include/linux/mfd/syscon/xlnx-vcu.h @@ -32,6 +32,7 @@ #define VCU_BUFFER_B_FRAME 0x5c #define VCU_WPP_EN 0x60 #define VCU_PLL_CLK_DEC 0x64 +#define VCU_NUM_CORE 0x6c #define VCU_GASKET_INIT 0x74 #define VCU_GASKET_VALUE 0x03 -- cgit v1.2.3 From 463edf5a59fd8f0fe0135101d67bfca81d1e3771 Mon Sep 17 00:00:00 2001 From: Wendy Liang Date: Tue, 24 Nov 2020 00:18:18 -0800 Subject: firmware: xlnx-zynqmp: fix compilation warning Fix compilation warning when ZYNQMP_FIRMWARE is not defined. 
include/linux/firmware/xlnx-zynqmp.h: In function 'zynqmp_pm_get_eemi_ops': include/linux/firmware/xlnx-zynqmp.h:363:9: error: implicit declaration of function 'ERR_PTR' [-Werror=implicit-function-declaration] 363 | return ERR_PTR(-ENODEV); include/linux/firmware/xlnx-zynqmp.h:363:18: note: each undeclared identifier is reported only once for each function it appears in include/linux/firmware/xlnx-zynqmp.h: In function 'zynqmp_pm_get_api_version': include/linux/firmware/xlnx-zynqmp.h:367:10: error: 'ENODEV' undeclared (first use in this function) 367 | return -ENODEV; | ^~~~~~ Signed-off-by: Wendy Liang Link: https://lore.kernel.org/r/1606205898-12642-1-git-send-email-wendy.liang@xilinx.com Signed-off-by: Michal Simek --- include/linux/firmware/xlnx-zynqmp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 5968df82b991..f84244ea633b 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -13,6 +13,8 @@ #ifndef __FIRMWARE_ZYNQMP_H__ #define __FIRMWARE_ZYNQMP_H__ +#include + #define ZYNQMP_PM_VERSION_MAJOR 1 #define ZYNQMP_PM_VERSION_MINOR 0 -- cgit v1.2.3 From 1f6a11a01059f9c65f8461987cc0bab4c0b58338 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 2 Dec 2020 08:38:48 +0100 Subject: firmware: xilinx: Remove additional newline This additional newline is useless and also reported by checkpatch --strict. Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/d927f3f2c97910958dd77a22828cd0bf8d89c9de.1606894725.git.michal.simek@xilinx.com --- include/linux/firmware/xlnx-zynqmp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index f84244ea633b..0db9005782d6 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -316,7 +316,6 @@ struct zynqmp_pm_query_data { u32 arg3; }; - int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 *ret_payload); -- cgit v1.2.3 From a80cefec2c2783166727324bde724c39aa8a12df Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 2 Dec 2020 08:38:49 +0100 Subject: firmware: xilinx: Add a blank line after function declaration Fix all these issues which are also reported by checkpatch --strict. 
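For reference, these reports come from running checkpatch in file mode, e.g. (hypothetical invocation from the kernel tree root):

	./scripts/checkpatch.pl --strict -f include/linux/firmware/xlnx-zynqmp.h

which flags "CHECK: Please use a blank line after function/struct/union/enum declarations" for each of the spots fixed below.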
Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/7b6007e05f6c01214861a37f198cd5bee62a4d3e.1606894725.git.michal.simek@xilinx.com --- include/linux/firmware/xlnx-zynqmp.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 0db9005782d6..0e7e72650ed3 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -363,107 +363,132 @@ static inline struct zynqmp_eemi_ops *zynqmp_pm_get_eemi_ops(void) { return ERR_PTR(-ENODEV); } + static inline int zynqmp_pm_get_api_version(u32 *version) { return -ENODEV; } + static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) { return -ENODEV; } + static inline int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out) { return -ENODEV; } + static inline int zynqmp_pm_clock_enable(u32 clock_id) { return -ENODEV; } + static inline int zynqmp_pm_clock_disable(u32 clock_id) { return -ENODEV; } + static inline int zynqmp_pm_clock_getstate(u32 clock_id, u32 *state) { return -ENODEV; } + static inline int zynqmp_pm_clock_setdivider(u32 clock_id, u32 divider) { return -ENODEV; } + static inline int zynqmp_pm_clock_getdivider(u32 clock_id, u32 *divider) { return -ENODEV; } + static inline int zynqmp_pm_clock_setrate(u32 clock_id, u64 rate) { return -ENODEV; } + static inline int zynqmp_pm_clock_getrate(u32 clock_id, u64 *rate) { return -ENODEV; } + static inline int zynqmp_pm_clock_setparent(u32 clock_id, u32 parent_id) { return -ENODEV; } + static inline int zynqmp_pm_clock_getparent(u32 clock_id, u32 *parent_id) { return -ENODEV; } + static inline int zynqmp_pm_set_pll_frac_mode(u32 clk_id, u32 mode) { return -ENODEV; } + static inline int zynqmp_pm_get_pll_frac_mode(u32 clk_id, u32 *mode) { return -ENODEV; } + static inline int zynqmp_pm_set_pll_frac_data(u32 clk_id, u32 data) { return -ENODEV; } + static inline int zynqmp_pm_get_pll_frac_data(u32 clk_id, u32 *data) { return -ENODEV; } + static inline int zynqmp_pm_set_sd_tapdelay(u32 node_id, u32 type, u32 value) { return -ENODEV; } + static inline int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type) { return -ENODEV; } + static inline int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, const enum zynqmp_pm_reset_action assert_flag) { return -ENODEV; } + static inline int zynqmp_pm_reset_get_status(const enum zynqmp_pm_reset reset, u32 *status) { return -ENODEV; } + static inline int zynqmp_pm_init_finalize(void) { return -ENODEV; } + static inline int zynqmp_pm_set_suspend_mode(u32 mode) { return -ENODEV; } + static inline int zynqmp_pm_request_node(const u32 node, const u32 capabilities, const u32 qos, const enum zynqmp_pm_request_ack ack) { return -ENODEV; } + static inline int zynqmp_pm_release_node(const u32 node) { return -ENODEV; } + static inline int zynqmp_pm_set_requirement(const u32 node, const u32 capabilities, const u32 qos, @@ -471,39 +496,48 @@ static inline int zynqmp_pm_set_requirement(const u32 node, { return -ENODEV; } + static inline int zynqmp_pm_aes_engine(const u64 address, u32 *out) { return -ENODEV; } + static inline int zynqmp_pm_fpga_load(const u64 address, const u32 size, const u32 flags) { return -ENODEV; } + static inline int zynqmp_pm_fpga_get_status(u32 *value) { return -ENODEV; } + static inline int zynqmp_pm_write_ggs(u32 index, u32 value) { return -ENODEV; } + static inline int zynqmp_pm_read_ggs(u32 index, u32 *value) { return -ENODEV; } + static inline int 
zynqmp_pm_write_pggs(u32 index, u32 value) { return -ENODEV; } + static inline int zynqmp_pm_read_pggs(u32 index, u32 *value) { return -ENODEV; } + static inline int zynqmp_pm_system_shutdown(const u32 type, const u32 subtype) { return -ENODEV; } + static inline int zynqmp_pm_set_boot_health_status(u32 value) { return -ENODEV; -- cgit v1.2.3 From 311c2520de21cb2f44291ad3d984b42191126628 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 2 Dec 2020 08:38:50 +0100 Subject: firmware: xilinx: Properly align function parameter Fix parameters alignment reported by checkpatch --strict. Signed-off-by: Michal Simek Link: https://lore.kernel.org/r/00ed9fcb94a6c22eff1fe8afdea46b2764a8687d.1606894725.git.michal.simek@xilinx.com --- include/linux/firmware/xlnx-zynqmp.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 0e7e72650ed3..edc2977b26d9 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -456,7 +456,7 @@ static inline int zynqmp_pm_sd_dll_reset(u32 node_id, u32 type) } static inline int zynqmp_pm_reset_assert(const enum zynqmp_pm_reset reset, - const enum zynqmp_pm_reset_action assert_flag) + const enum zynqmp_pm_reset_action assert_flag) { return -ENODEV; } @@ -490,9 +490,9 @@ static inline int zynqmp_pm_release_node(const u32 node) } static inline int zynqmp_pm_set_requirement(const u32 node, - const u32 capabilities, - const u32 qos, - const enum zynqmp_pm_request_ack ack) + const u32 capabilities, + const u32 qos, + const enum zynqmp_pm_request_ack ack) { return -ENODEV; } -- cgit v1.2.3 From 5a6338cce9f4133c478d3b10b300f96dd644379a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 17 Nov 2020 15:32:06 +0530 Subject: mailbox: arm_mhuv2: Add driver This adds driver for the ARM MHUv2 (Message Handling Unit) mailbox controller. This is based on the accepted DT bindings of the controller and supports combination of both transport protocols, i.e. doorbell and data-transfer. Transmitting and receiving data through the mailbox framework is done through struct arm_mhuv2_mbox_msg. Based on the initial work done by Morten Borup Petersen from ARM. 
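For illustration, a minimal, hypothetical sender-side client sketch (not part of this patch): demo_send() and the payload are invented, and it assumes the client's device tree node has an mboxes phandle to an arm,mhuv2-tx frame whose first channel uses the data-transfer protocol:

	#include <linux/err.h>
	#include <linux/mailbox_client.h>
	#include <linux/mailbox/arm_mhuv2_message.h>

	static int demo_send(struct device *dev)
	{
		struct mbox_client cl = {
			.dev = dev,
			.tx_block = true,
			.tx_tout = 500,		/* ms */
		};
		struct mbox_chan *chan;
		/* First word must be non-zero in data-transfer mode */
		u32 payload[2] = { 0xdeadbeef, 0x1 };
		struct arm_mhuv2_mbox_msg msg = {
			.data = payload,
			.len = sizeof(payload),
		};
		int ret;

		chan = mbox_request_channel(&cl, 0);
		if (IS_ERR(chan))
			return PTR_ERR(chan);

		ret = mbox_send_message(chan, &msg);
		mbox_free_channel(chan);

		return ret < 0 ? ret : 0;
	}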
Co-developed-by: Tushar Khandelwal Signed-off-by: Tushar Khandelwal Tested-by: Usama Arif Signed-off-by: Viresh Kumar Signed-off-by: Jassi Brar --- MAINTAINERS | 9 + drivers/mailbox/Kconfig | 7 + drivers/mailbox/Makefile | 2 + drivers/mailbox/arm_mhuv2.c | 1136 +++++++++++++++++++++++++++++ include/linux/mailbox/arm_mhuv2_message.h | 20 + 5 files changed, 1174 insertions(+) create mode 100644 drivers/mailbox/arm_mhuv2.c create mode 100644 include/linux/mailbox/arm_mhuv2_message.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 2daa6ee673f7..3917b7ef1da6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10418,6 +10418,15 @@ F: drivers/mailbox/ F: include/linux/mailbox_client.h F: include/linux/mailbox_controller.h +MAILBOX ARM MHUv2 +M: Viresh Kumar +M: Tushar Khandelwal +L: linux-kernel@vger.kernel.org +S: Maintained +F: drivers/mailbox/arm_mhuv2.c +F: include/linux/mailbox/arm_mhuv2_message.h +F: Documentation/devicetree/bindings/mailbox/arm,mhuv2.yaml + MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7 M: Michael Kerrisk L: linux-man@vger.kernel.org diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig index abbf5d67ffa2..f4abe3529acd 100644 --- a/drivers/mailbox/Kconfig +++ b/drivers/mailbox/Kconfig @@ -16,6 +16,13 @@ config ARM_MHU The controller has 3 mailbox channels, the last of which can be used in Secure mode only. +config ARM_MHU_V2 + tristate "ARM MHUv2 Mailbox" + depends on ARM_AMBA + help + Say Y here if you want to build the ARM MHUv2 controller driver, + which provides unidirectional mailboxes between processing elements. + config IMX_MBOX tristate "i.MX Mailbox" depends on ARCH_MXC || COMPILE_TEST diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile index 2e06e02b2e03..7194fa92c787 100644 --- a/drivers/mailbox/Makefile +++ b/drivers/mailbox/Makefile @@ -7,6 +7,8 @@ obj-$(CONFIG_MAILBOX_TEST) += mailbox-test.o obj-$(CONFIG_ARM_MHU) += arm_mhu.o arm_mhu_db.o +obj-$(CONFIG_ARM_MHU_V2) += arm_mhuv2.o + obj-$(CONFIG_IMX_MBOX) += imx-mailbox.o obj-$(CONFIG_ARMADA_37XX_RWTM_MBOX) += armada-37xx-rwtm-mailbox.o diff --git a/drivers/mailbox/arm_mhuv2.c b/drivers/mailbox/arm_mhuv2.c new file mode 100644 index 000000000000..67fb10885bb4 --- /dev/null +++ b/drivers/mailbox/arm_mhuv2.c @@ -0,0 +1,1136 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM Message Handling Unit Version 2 (MHUv2) driver. + * + * Copyright (C) 2020 ARM Ltd. + * Copyright (C) 2020 Linaro Ltd. + * + * An MHUv2 mailbox controller can provide up to 124 channel windows (each 32 + * bit long) and the driver allows any combination of both the transport + * protocol modes: data-transfer and doorbell, to be used on those channel + * windows. + * + * The transport protocols should be specified in the device tree entry for the + * device. The transport protocols determine how the underlying hardware + * resources of the device are utilized when transmitting data. Refer to the + * device tree bindings of the ARM MHUv2 controller for more details. + * + * The number of registered mailbox channels is dependent on both the underlying + * hardware - mainly the number of channel windows implemented by the platform, + * as well as the selected transport protocols. + * + * The MHUv2 controller can work both as a sender and receiver, but the driver + * and the DT bindings support unidirectional transfers for better allocation of + * the channels. 
That is, this driver will be probed for two separate devices + * for each mailbox controller, a sender device and a receiver device. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* ====== MHUv2 Registers ====== */ + +/* Maximum number of channel windows */ +#define MHUV2_CH_WN_MAX 124 +/* Number of combined interrupt status registers */ +#define MHUV2_CMB_INT_ST_REG_CNT 4 +#define MHUV2_STAT_BYTES (sizeof(u32)) +#define MHUV2_STAT_BITS (MHUV2_STAT_BYTES * __CHAR_BIT__) + +#define LSB_MASK(n) ((1 << (n * __CHAR_BIT__)) - 1) +#define MHUV2_PROTOCOL_PROP "arm,mhuv2-protocols" + +/* Register Message Handling Unit Configuration fields */ +struct mhu_cfg_t { + u32 num_ch : 7; + u32 pad : 25; +} __packed; + +/* register Interrupt Status fields */ +struct int_st_t { + u32 nr2r : 1; + u32 r2nr : 1; + u32 pad : 30; +} __packed; + +/* Register Interrupt Clear fields */ +struct int_clr_t { + u32 nr2r : 1; + u32 r2nr : 1; + u32 pad : 30; +} __packed; + +/* Register Interrupt Enable fields */ +struct int_en_t { + u32 r2nr : 1; + u32 nr2r : 1; + u32 chcomb : 1; + u32 pad : 29; +} __packed; + +/* Register Implementer Identification fields */ +struct iidr_t { + u32 implementer : 12; + u32 revision : 4; + u32 variant : 4; + u32 product_id : 12; +} __packed; + +/* Register Architecture Identification Register fields */ +struct aidr_t { + u32 arch_minor_rev : 4; + u32 arch_major_rev : 4; + u32 pad : 24; +} __packed; + +/* Sender Channel Window fields */ +struct mhu2_send_ch_wn_reg { + u32 stat; + u8 pad1[0x0C - 0x04]; + u32 stat_set; + u32 int_st; + u32 int_clr; + u32 int_en; + u8 pad2[0x20 - 0x1C]; +} __packed; + +/* Sender frame register fields */ +struct mhu2_send_frame_reg { + struct mhu2_send_ch_wn_reg ch_wn[MHUV2_CH_WN_MAX]; + struct mhu_cfg_t mhu_cfg; + u32 resp_cfg; + u32 access_request; + u32 access_ready; + struct int_st_t int_st; + struct int_clr_t int_clr; + struct int_en_t int_en; + u32 reserved0; + u32 chcomb_int_st[MHUV2_CMB_INT_ST_REG_CNT]; + u8 pad[0xFC8 - 0xFB0]; + struct iidr_t iidr; + struct aidr_t aidr; +} __packed; + +/* Receiver Channel Window fields */ +struct mhu2_recv_ch_wn_reg { + u32 stat; + u32 stat_masked; + u32 stat_clear; + u8 reserved0[0x10 - 0x0C]; + u32 mask; + u32 mask_set; + u32 mask_clear; + u8 pad[0x20 - 0x1C]; +} __packed; + +/* Receiver frame register fields */ +struct mhu2_recv_frame_reg { + struct mhu2_recv_ch_wn_reg ch_wn[MHUV2_CH_WN_MAX]; + struct mhu_cfg_t mhu_cfg; + u8 reserved0[0xF90 - 0xF84]; + struct int_st_t int_st; + struct int_clr_t int_clr; + struct int_en_t int_en; + u32 pad; + u32 chcomb_int_st[MHUV2_CMB_INT_ST_REG_CNT]; + u8 reserved2[0xFC8 - 0xFB0]; + struct iidr_t iidr; + struct aidr_t aidr; +} __packed; + + +/* ====== MHUv2 data structures ====== */ + +enum mhuv2_transport_protocol { + DOORBELL = 0, + DATA_TRANSFER = 1 +}; + +enum mhuv2_frame { + RECEIVER_FRAME, + SENDER_FRAME +}; + +/** + * struct mhuv2 - MHUv2 mailbox controller data + * + * @mbox: Mailbox controller belonging to the MHU frame. + * @send/recv: Base address of the register mapping region. + * @frame: Frame type: RECEIVER_FRAME or SENDER_FRAME. + * @irq: Interrupt. + * @windows: Channel windows implemented by the platform. + * @minor: Minor version of the controller. + * @length: Length of the protocols array in bytes. + * @protocols: Raw protocol information, derived from device tree. + * @doorbell_pending_lock: spinlock required for correct operation of Tx + * interrupt for doorbells. 
+ */ +struct mhuv2 { + struct mbox_controller mbox; + union { + struct mhu2_send_frame_reg __iomem *send; + struct mhu2_recv_frame_reg __iomem *recv; + }; + enum mhuv2_frame frame; + unsigned int irq; + unsigned int windows; + unsigned int minor; + unsigned int length; + u32 *protocols; + + spinlock_t doorbell_pending_lock; +}; + +#define mhu_from_mbox(_mbox) container_of(_mbox, struct mhuv2, mbox) + +/** + * struct mhuv2_protocol_ops - MHUv2 operations + * + * Each transport protocol must provide an implementation of the operations + * provided here. + * + * @rx_startup: Startup callback for receiver. + * @rx_shutdown: Shutdown callback for receiver. + * @read_data: Reads and clears newly available data. + * @tx_startup: Startup callback for receiver. + * @tx_shutdown: Shutdown callback for receiver. + * @last_tx_done: Report back if the last tx is completed or not. + * @send_data: Send data to the receiver. + */ +struct mhuv2_protocol_ops { + int (*rx_startup)(struct mhuv2 *mhu, struct mbox_chan *chan); + void (*rx_shutdown)(struct mhuv2 *mhu, struct mbox_chan *chan); + void *(*read_data)(struct mhuv2 *mhu, struct mbox_chan *chan); + + void (*tx_startup)(struct mhuv2 *mhu, struct mbox_chan *chan); + void (*tx_shutdown)(struct mhuv2 *mhu, struct mbox_chan *chan); + int (*last_tx_done)(struct mhuv2 *mhu, struct mbox_chan *chan); + int (*send_data)(struct mhuv2 *mhu, struct mbox_chan *chan, void *arg); +}; + +/* + * MHUv2 mailbox channel's private information + * + * @ops: protocol specific ops for the channel. + * @ch_wn_idx: Channel window index allocated to the channel. + * @windows: Total number of windows consumed by the channel, only relevant + * in DATA_TRANSFER protocol. + * @doorbell: Doorbell bit number within the ch_wn_idx window, only relevant + * in DOORBELL protocol. + * @pending: Flag indicating pending doorbell interrupt, only relevant in + * DOORBELL protocol. 
+ */ +struct mhuv2_mbox_chan_priv { + const struct mhuv2_protocol_ops *ops; + u32 ch_wn_idx; + union { + u32 windows; + struct { + u32 doorbell; + u32 pending; + }; + }; +}; + +/* Macro for reading a bitfield within a physically mapped packed struct */ +#define readl_relaxed_bitfield(_regptr, _field) \ + ({ \ + u32 _regval; \ + _regval = readl_relaxed((_regptr)); \ + (*(typeof((_regptr)))(&_regval))._field; \ + }) + +/* Macro for writing a bitfield within a physically mapped packed struct */ +#define writel_relaxed_bitfield(_value, _regptr, _field) \ + ({ \ + u32 _regval; \ + _regval = readl_relaxed(_regptr); \ + (*(typeof(_regptr))(&_regval))._field = _value; \ + writel_relaxed(_regval, _regptr); \ + }) + + +/* =================== Doorbell transport protocol operations =============== */ + +static int mhuv2_doorbell_rx_startup(struct mhuv2 *mhu, struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + writel_relaxed(BIT(priv->doorbell), + &mhu->recv->ch_wn[priv->ch_wn_idx].mask_clear); + return 0; +} + +static void mhuv2_doorbell_rx_shutdown(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + writel_relaxed(BIT(priv->doorbell), + &mhu->recv->ch_wn[priv->ch_wn_idx].mask_set); +} + +static void *mhuv2_doorbell_read_data(struct mhuv2 *mhu, struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + writel_relaxed(BIT(priv->doorbell), + &mhu->recv->ch_wn[priv->ch_wn_idx].stat_clear); + return NULL; +} + +static int mhuv2_doorbell_last_tx_done(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + return !(readl_relaxed(&mhu->send->ch_wn[priv->ch_wn_idx].stat) & + BIT(priv->doorbell)); +} + +static int mhuv2_doorbell_send_data(struct mhuv2 *mhu, struct mbox_chan *chan, + void *arg) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + unsigned long flags; + + spin_lock_irqsave(&mhu->doorbell_pending_lock, flags); + + priv->pending = 1; + writel_relaxed(BIT(priv->doorbell), + &mhu->send->ch_wn[priv->ch_wn_idx].stat_set); + + spin_unlock_irqrestore(&mhu->doorbell_pending_lock, flags); + + return 0; +} + +static const struct mhuv2_protocol_ops mhuv2_doorbell_ops = { + .rx_startup = mhuv2_doorbell_rx_startup, + .rx_shutdown = mhuv2_doorbell_rx_shutdown, + .read_data = mhuv2_doorbell_read_data, + .last_tx_done = mhuv2_doorbell_last_tx_done, + .send_data = mhuv2_doorbell_send_data, +}; +#define IS_PROTOCOL_DOORBELL(_priv) (_priv->ops == &mhuv2_doorbell_ops) + +/* ============= Data transfer transport protocol operations ================ */ + +static int mhuv2_data_transfer_rx_startup(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + /* + * The protocol mandates that all but the last status register must be + * masked. 
+ */ + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_clear); + return 0; +} + +static void mhuv2_data_transfer_rx_shutdown(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_set); +} + +static void *mhuv2_data_transfer_read_data(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + const int windows = priv->windows; + struct arm_mhuv2_mbox_msg *msg; + u32 *data; + int i, idx; + + msg = kzalloc(sizeof(*msg) + windows * MHUV2_STAT_BYTES, GFP_KERNEL); + if (!msg) + return ERR_PTR(-ENOMEM); + + data = msg->data = msg + 1; + msg->len = windows * MHUV2_STAT_BYTES; + + /* + * Messages are expected in order of most significant word to least + * significant word. Refer mhuv2_data_transfer_send_data() for more + * details. + * + * We also need to read the stat register instead of stat_masked, as we + * masked all but the last window. + * + * Last channel window must be cleared as the final operation. Upon + * clearing the last channel window register, which is unmasked in + * data-transfer protocol, the interrupt is de-asserted. + */ + for (i = 0; i < windows; i++) { + idx = priv->ch_wn_idx + i; + data[windows - 1 - i] = readl_relaxed(&mhu->recv->ch_wn[idx].stat); + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[idx].stat_clear); + } + + return msg; +} + +static void mhuv2_data_transfer_tx_startup(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + /* Enable interrupts only for the last window */ + if (mhu->minor) { + writel_relaxed(0x1, &mhu->send->ch_wn[i].int_clr); + writel_relaxed(0x1, &mhu->send->ch_wn[i].int_en); + } +} + +static void mhuv2_data_transfer_tx_shutdown(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + if (mhu->minor) + writel_relaxed(0x0, &mhu->send->ch_wn[i].int_en); +} + +static int mhuv2_data_transfer_last_tx_done(struct mhuv2 *mhu, + struct mbox_chan *chan) +{ + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int i = priv->ch_wn_idx + priv->windows - 1; + + /* Just checking the last channel window should be enough */ + return !readl_relaxed(&mhu->send->ch_wn[i].stat); +} + +/* + * Message will be transmitted from most significant to least significant word. + * This is to allow for messages shorter than channel windows to still trigger + * the receiver interrupt which gets activated when the last stat register is + * written. As an example, a 6-word message is to be written on a 4-channel MHU + * connection: Registers marked with '*' are masked, and will not generate an + * interrupt on the receiver side once written. 
+ * + * u32 *data = [0x00000001], [0x00000002], [0x00000003], [0x00000004], + * [0x00000005], [0x00000006] + * + * ROUND 1: + * stat reg To write Write sequence + * [ stat 3 ] <- [0x00000001] 4 <- triggers interrupt on receiver + * [ stat 2 ] <- [0x00000002] 3 + * [ stat 1 ] <- [0x00000003] 2 + * [ stat 0 ] <- [0x00000004] 1 + * + * data += 4 // Increment data pointer by number of stat regs + * + * ROUND 2: + * stat reg To write Write sequence + * [ stat 3 ] <- [0x00000005] 2 <- triggers interrupt on receiver + * [ stat 2 ] <- [0x00000006] 1 + * [ stat 1 ] <- [0x00000000] + * [ stat 0 ] <- [0x00000000] + */ +static int mhuv2_data_transfer_send_data(struct mhuv2 *mhu, + struct mbox_chan *chan, void *arg) +{ + const struct arm_mhuv2_mbox_msg *msg = arg; + int bytes_left = msg->len, bytes_to_send, bytes_in_round, i; + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + int windows = priv->windows; + u32 *data = msg->data, word; + + while (bytes_left) { + if (!data[0]) { + dev_err(mhu->mbox.dev, "Data aligned at first window can't be zero to guarantee interrupt generation at receiver"); + return -EINVAL; + } + + while(!mhuv2_data_transfer_last_tx_done(mhu, chan)) + continue; + + bytes_in_round = min(bytes_left, (int)(windows * MHUV2_STAT_BYTES)); + + for (i = windows - 1; i >= 0; i--) { + /* Data less than windows can transfer ? */ + if (unlikely(bytes_in_round <= i * MHUV2_STAT_BYTES)) + continue; + + word = data[i]; + bytes_to_send = bytes_in_round & (MHUV2_STAT_BYTES - 1); + if (unlikely(bytes_to_send)) + word &= LSB_MASK(bytes_to_send); + else + bytes_to_send = MHUV2_STAT_BYTES; + + writel_relaxed(word, &mhu->send->ch_wn[priv->ch_wn_idx + windows - 1 - i].stat_set); + bytes_left -= bytes_to_send; + bytes_in_round -= bytes_to_send; + } + + data += windows; + } + + return 0; +} + +static const struct mhuv2_protocol_ops mhuv2_data_transfer_ops = { + .rx_startup = mhuv2_data_transfer_rx_startup, + .rx_shutdown = mhuv2_data_transfer_rx_shutdown, + .read_data = mhuv2_data_transfer_read_data, + .tx_startup = mhuv2_data_transfer_tx_startup, + .tx_shutdown = mhuv2_data_transfer_tx_shutdown, + .last_tx_done = mhuv2_data_transfer_last_tx_done, + .send_data = mhuv2_data_transfer_send_data, +}; + +/* Interrupt handlers */ + +static struct mbox_chan *get_irq_chan_comb(struct mhuv2 *mhu, u32 *reg) +{ + struct mbox_chan *chans = mhu->mbox.chans; + int channel = 0, i, offset = 0, windows, protocol, ch_wn; + u32 stat; + + for (i = 0; i < MHUV2_CMB_INT_ST_REG_CNT; i++) { + stat = readl_relaxed(reg + i); + if (!stat) + continue; + + ch_wn = i * MHUV2_STAT_BITS + __builtin_ctz(stat); + + for (i = 0; i < mhu->length; i += 2) { + protocol = mhu->protocols[i]; + windows = mhu->protocols[i + 1]; + + if (ch_wn >= offset + windows) { + if (protocol == DOORBELL) + channel += MHUV2_STAT_BITS * windows; + else + channel++; + + offset += windows; + continue; + } + + /* Return first chan of the window in doorbell mode */ + if (protocol == DOORBELL) + channel += MHUV2_STAT_BITS * (ch_wn - offset); + + return &chans[channel]; + } + } + + return ERR_PTR(-EIO); +} + +static irqreturn_t mhuv2_sender_interrupt(int irq, void *data) +{ + struct mhuv2 *mhu = data; + struct device *dev = mhu->mbox.dev; + struct mhuv2_mbox_chan_priv *priv; + struct mbox_chan *chan; + unsigned long flags; + int i, found = 0; + u32 stat; + + chan = get_irq_chan_comb(mhu, mhu->send->chcomb_int_st); + if (IS_ERR(chan)) { + dev_warn(dev, "Failed to find channel for the Tx interrupt\n"); + return IRQ_NONE; + } + priv = chan->con_priv; + + if 
(!IS_PROTOCOL_DOORBELL(priv)) { + writel_relaxed(1, &mhu->send->ch_wn[priv->ch_wn_idx + priv->windows - 1].int_clr); + + if (chan->cl) { + mbox_chan_txdone(chan, 0); + return IRQ_HANDLED; + } + + dev_warn(dev, "Tx interrupt Received on channel (%u) not currently attached to a mailbox client\n", + priv->ch_wn_idx); + return IRQ_NONE; + } + + /* Clear the interrupt first, so we don't miss any doorbell later */ + writel_relaxed(1, &mhu->send->ch_wn[priv->ch_wn_idx].int_clr); + + /* + * In Doorbell mode, make sure no new transitions happen while the + * interrupt handler is trying to find the finished doorbell tx + * operations, else we may think few of the transfers were complete + * before they actually were. + */ + spin_lock_irqsave(&mhu->doorbell_pending_lock, flags); + + /* + * In case of doorbell mode, the first channel of the window is returned + * by get_irq_chan_comb(). Find all the pending channels here. + */ + stat = readl_relaxed(&mhu->send->ch_wn[priv->ch_wn_idx].stat); + + for (i = 0; i < MHUV2_STAT_BITS; i++) { + priv = chan[i].con_priv; + + /* Find cases where pending was 1, but stat's bit is cleared */ + if (priv->pending ^ ((stat >> i) & 0x1)) { + BUG_ON(!priv->pending); + + if (!chan->cl) { + dev_warn(dev, "Tx interrupt received on doorbell (%u : %u) channel not currently attached to a mailbox client\n", + priv->ch_wn_idx, i); + continue; + } + + mbox_chan_txdone(&chan[i], 0); + priv->pending = 0; + found++; + } + } + + spin_unlock_irqrestore(&mhu->doorbell_pending_lock, flags); + + if (!found) { + /* + * We may have already processed the doorbell in the previous + * iteration if the interrupt came right after we cleared it but + * before we read the stat register. + */ + dev_dbg(dev, "Couldn't find the doorbell (%u) for the Tx interrupt interrupt\n", + priv->ch_wn_idx); + return IRQ_NONE; + } + + return IRQ_HANDLED; +} + +static struct mbox_chan *get_irq_chan_comb_rx(struct mhuv2 *mhu) +{ + struct mhuv2_mbox_chan_priv *priv; + struct mbox_chan *chan; + u32 stat; + + chan = get_irq_chan_comb(mhu, mhu->recv->chcomb_int_st); + if (IS_ERR(chan)) + return chan; + + priv = chan->con_priv; + if (!IS_PROTOCOL_DOORBELL(priv)) + return chan; + + /* + * In case of doorbell mode, the first channel of the window is returned + * by the routine. Find the exact channel here. + */ + stat = readl_relaxed(&mhu->recv->ch_wn[priv->ch_wn_idx].stat_masked); + BUG_ON(!stat); + + return chan + __builtin_ctz(stat); +} + +static struct mbox_chan *get_irq_chan_stat_rx(struct mhuv2 *mhu) +{ + struct mbox_chan *chans = mhu->mbox.chans; + struct mhuv2_mbox_chan_priv *priv; + u32 stat; + int i = 0; + + while (i < mhu->mbox.num_chans) { + priv = chans[i].con_priv; + stat = readl_relaxed(&mhu->recv->ch_wn[priv->ch_wn_idx].stat_masked); + + if (stat) { + if (IS_PROTOCOL_DOORBELL(priv)) + i += __builtin_ctz(stat); + return &chans[i]; + } + + i += IS_PROTOCOL_DOORBELL(priv) ? 
MHUV2_STAT_BITS : 1; + } + + return ERR_PTR(-EIO); +} + +static struct mbox_chan *get_irq_chan_rx(struct mhuv2 *mhu) +{ + if (!mhu->minor) + return get_irq_chan_stat_rx(mhu); + + return get_irq_chan_comb_rx(mhu); +} + +static irqreturn_t mhuv2_receiver_interrupt(int irq, void *arg) +{ + struct mhuv2 *mhu = arg; + struct mbox_chan *chan = get_irq_chan_rx(mhu); + struct device *dev = mhu->mbox.dev; + struct mhuv2_mbox_chan_priv *priv; + int ret = IRQ_NONE; + void *data; + + if (IS_ERR(chan)) { + dev_warn(dev, "Failed to find channel for the rx interrupt\n"); + return IRQ_NONE; + } + priv = chan->con_priv; + + /* Read and clear the data first */ + data = priv->ops->read_data(mhu, chan); + + if (!chan->cl) { + dev_warn(dev, "Received data on channel (%u) not currently attached to a mailbox client\n", + priv->ch_wn_idx); + } else if (IS_ERR(data)) { + dev_err(dev, "Failed to read data: %lu\n", PTR_ERR(data)); + } else { + mbox_chan_received_data(chan, data); + ret = IRQ_HANDLED; + } + + kfree(data); + return ret; +} + +/* Sender and receiver ops */ +static bool mhuv2_sender_last_tx_done(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + return priv->ops->last_tx_done(mhu, chan); +} + +static int mhuv2_sender_send_data(struct mbox_chan *chan, void *data) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + if (!priv->ops->last_tx_done(mhu, chan)) + return -EBUSY; + + return priv->ops->send_data(mhu, chan, data); +} + +static int mhuv2_sender_startup(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + if (priv->ops->tx_startup) + priv->ops->tx_startup(mhu, chan); + return 0; +} + +static void mhuv2_sender_shutdown(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + if (priv->ops->tx_shutdown) + priv->ops->tx_shutdown(mhu, chan); +} + +static const struct mbox_chan_ops mhuv2_sender_ops = { + .send_data = mhuv2_sender_send_data, + .startup = mhuv2_sender_startup, + .shutdown = mhuv2_sender_shutdown, + .last_tx_done = mhuv2_sender_last_tx_done, +}; + +static int mhuv2_receiver_startup(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + return priv->ops->rx_startup(mhu, chan); +} + +static void mhuv2_receiver_shutdown(struct mbox_chan *chan) +{ + struct mhuv2 *mhu = mhu_from_mbox(chan->mbox); + struct mhuv2_mbox_chan_priv *priv = chan->con_priv; + + priv->ops->rx_shutdown(mhu, chan); +} + +static int mhuv2_receiver_send_data(struct mbox_chan *chan, void *data) +{ + dev_err(chan->mbox->dev, + "Trying to transmit on a receiver MHU frame\n"); + return -EIO; +} + +static bool mhuv2_receiver_last_tx_done(struct mbox_chan *chan) +{ + dev_err(chan->mbox->dev, "Trying to Tx poll on a receiver MHU frame\n"); + return true; +} + +static const struct mbox_chan_ops mhuv2_receiver_ops = { + .send_data = mhuv2_receiver_send_data, + .startup = mhuv2_receiver_startup, + .shutdown = mhuv2_receiver_shutdown, + .last_tx_done = mhuv2_receiver_last_tx_done, +}; + +static struct mbox_chan *mhuv2_mbox_of_xlate(struct mbox_controller *mbox, + const struct of_phandle_args *pa) +{ + struct mhuv2 *mhu = mhu_from_mbox(mbox); + struct mbox_chan *chans = mbox->chans; + int channel = 0, i, offset, doorbell, protocol, windows; + + if (pa->args_count != 
2) + return ERR_PTR(-EINVAL); + + offset = pa->args[0]; + doorbell = pa->args[1]; + if (doorbell >= MHUV2_STAT_BITS) + goto out; + + for (i = 0; i < mhu->length; i += 2) { + protocol = mhu->protocols[i]; + windows = mhu->protocols[i + 1]; + + if (protocol == DOORBELL) { + if (offset < windows) + return &chans[channel + MHUV2_STAT_BITS * offset + doorbell]; + + channel += MHUV2_STAT_BITS * windows; + offset -= windows; + } else { + if (offset == 0) { + if (doorbell) + goto out; + + return &chans[channel]; + } + + channel++; + offset--; + } + } + +out: + dev_err(mbox->dev, "Couldn't xlate to a valid channel (%d: %d)\n", + pa->args[0], doorbell); + return ERR_PTR(-ENODEV); +} + +static int mhuv2_verify_protocol(struct mhuv2 *mhu) +{ + struct device *dev = mhu->mbox.dev; + int protocol, windows, channels = 0, total_windows = 0, i; + + for (i = 0; i < mhu->length; i += 2) { + protocol = mhu->protocols[i]; + windows = mhu->protocols[i + 1]; + + if (!windows) { + dev_err(dev, "Window size can't be zero (%d)\n", i); + return -EINVAL; + } + total_windows += windows; + + if (protocol == DOORBELL) { + channels += MHUV2_STAT_BITS * windows; + } else if (protocol == DATA_TRANSFER) { + channels++; + } else { + dev_err(dev, "Invalid protocol (%d) present in %s property at index %d\n", + protocol, MHUV2_PROTOCOL_PROP, i); + return -EINVAL; + } + } + + if (total_windows > mhu->windows) { + dev_err(dev, "Channel windows can't be more than what's implemented by the hardware ( %d: %d)\n", + total_windows, mhu->windows); + return -EINVAL; + } + + mhu->mbox.num_chans = channels; + return 0; +} + +static int mhuv2_allocate_channels(struct mhuv2 *mhu) +{ + struct mbox_controller *mbox = &mhu->mbox; + struct mhuv2_mbox_chan_priv *priv; + struct device *dev = mbox->dev; + struct mbox_chan *chans; + int protocol, windows = 0, next_window = 0, i, j, k; + + chans = devm_kcalloc(dev, mbox->num_chans, sizeof(*chans), GFP_KERNEL); + if (!chans) + return -ENOMEM; + + mbox->chans = chans; + + for (i = 0; i < mhu->length; i += 2) { + next_window += windows; + + protocol = mhu->protocols[i]; + windows = mhu->protocols[i + 1]; + + if (protocol == DATA_TRANSFER) { + priv = devm_kmalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->ch_wn_idx = next_window; + priv->ops = &mhuv2_data_transfer_ops; + priv->windows = windows; + chans++->con_priv = priv; + continue; + } + + for (j = 0; j < windows; j++) { + for (k = 0; k < MHUV2_STAT_BITS; k++) { + priv = devm_kmalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->ch_wn_idx = next_window + j; + priv->ops = &mhuv2_doorbell_ops; + priv->doorbell = k; + chans++->con_priv = priv; + } + + /* + * Permanently enable interrupt as we can't + * control it per doorbell. 
+ */ + if (mhu->frame == SENDER_FRAME && mhu->minor) + writel_relaxed(0x1, &mhu->send->ch_wn[priv->ch_wn_idx].int_en); + } + } + + /* Make sure we have initialized all channels */ + BUG_ON(chans - mbox->chans != mbox->num_chans); + + return 0; +} + +static int mhuv2_parse_channels(struct mhuv2 *mhu) +{ + struct device *dev = mhu->mbox.dev; + const struct device_node *np = dev->of_node; + int ret, count; + u32 *protocols; + + count = of_property_count_u32_elems(np, MHUV2_PROTOCOL_PROP); + if (count <= 0 || count % 2) { + dev_err(dev, "Invalid %s property (%d)\n", MHUV2_PROTOCOL_PROP, + count); + return -EINVAL; + } + + protocols = devm_kmalloc_array(dev, count, sizeof(*protocols), GFP_KERNEL); + if (!protocols) + return -ENOMEM; + + ret = of_property_read_u32_array(np, MHUV2_PROTOCOL_PROP, protocols, count); + if (ret) { + dev_err(dev, "Failed to read %s property: %d\n", + MHUV2_PROTOCOL_PROP, ret); + return ret; + } + + mhu->protocols = protocols; + mhu->length = count; + + ret = mhuv2_verify_protocol(mhu); + if (ret) + return ret; + + return mhuv2_allocate_channels(mhu); +} + +static int mhuv2_tx_init(struct amba_device *adev, struct mhuv2 *mhu, + void __iomem *reg) +{ + struct device *dev = mhu->mbox.dev; + int ret, i; + + mhu->frame = SENDER_FRAME; + mhu->mbox.ops = &mhuv2_sender_ops; + mhu->send = reg; + + mhu->windows = readl_relaxed_bitfield(&mhu->send->mhu_cfg, num_ch); + mhu->minor = readl_relaxed_bitfield(&mhu->send->aidr, arch_minor_rev); + + spin_lock_init(&mhu->doorbell_pending_lock); + + /* + * For minor version 1 and forward, tx interrupt is provided by + * the controller. + */ + if (mhu->minor && adev->irq[0]) { + ret = devm_request_threaded_irq(dev, adev->irq[0], NULL, + mhuv2_sender_interrupt, + IRQF_ONESHOT, "mhuv2-tx", mhu); + if (ret) { + dev_err(dev, "Failed to request tx IRQ, fallback to polling mode: %d\n", + ret); + } else { + mhu->mbox.txdone_irq = true; + mhu->mbox.txdone_poll = false; + mhu->irq = adev->irq[0]; + + writel_relaxed_bitfield(1, &mhu->send->int_en, chcomb); + + /* Disable all channel interrupts */ + for (i = 0; i < mhu->windows; i++) + writel_relaxed(0x0, &mhu->send->ch_wn[i].int_en); + + goto out; + } + } + + mhu->mbox.txdone_irq = false; + mhu->mbox.txdone_poll = true; + mhu->mbox.txpoll_period = 1; + +out: + /* Wait for receiver to be ready */ + writel_relaxed(0x1, &mhu->send->access_request); + while (!readl_relaxed(&mhu->send->access_ready)) + continue; + + return 0; +} + +static int mhuv2_rx_init(struct amba_device *adev, struct mhuv2 *mhu, + void __iomem *reg) +{ + struct device *dev = mhu->mbox.dev; + int ret, i; + + mhu->frame = RECEIVER_FRAME; + mhu->mbox.ops = &mhuv2_receiver_ops; + mhu->recv = reg; + + mhu->windows = readl_relaxed_bitfield(&mhu->recv->mhu_cfg, num_ch); + mhu->minor = readl_relaxed_bitfield(&mhu->recv->aidr, arch_minor_rev); + + mhu->irq = adev->irq[0]; + if (!mhu->irq) { + dev_err(dev, "Missing receiver IRQ\n"); + return -EINVAL; + } + + ret = devm_request_threaded_irq(dev, mhu->irq, NULL, + mhuv2_receiver_interrupt, IRQF_ONESHOT, + "mhuv2-rx", mhu); + if (ret) { + dev_err(dev, "Failed to request rx IRQ\n"); + return ret; + } + + /* Mask all the channel windows */ + for (i = 0; i < mhu->windows; i++) + writel_relaxed(0xFFFFFFFF, &mhu->recv->ch_wn[i].mask_set); + + if (mhu->minor) + writel_relaxed_bitfield(1, &mhu->recv->int_en, chcomb); + + return 0; +} + +static int mhuv2_probe(struct amba_device *adev, const struct amba_id *id) +{ + struct device *dev = &adev->dev; + const struct device_node *np = dev->of_node; + struct 
mhuv2 *mhu; + void __iomem *reg; + int ret = -EINVAL; + + reg = devm_of_iomap(dev, dev->of_node, 0, NULL); + if (!reg) + return -ENOMEM; + + mhu = devm_kzalloc(dev, sizeof(*mhu), GFP_KERNEL); + if (!mhu) + return -ENOMEM; + + mhu->mbox.dev = dev; + mhu->mbox.of_xlate = mhuv2_mbox_of_xlate; + + if (of_device_is_compatible(np, "arm,mhuv2-tx")) + ret = mhuv2_tx_init(adev, mhu, reg); + else if (of_device_is_compatible(np, "arm,mhuv2-rx")) + ret = mhuv2_rx_init(adev, mhu, reg); + else + dev_err(dev, "Invalid compatible property\n"); + + if (ret) + return ret; + + /* Channel windows can't be 0 */ + BUG_ON(!mhu->windows); + + ret = mhuv2_parse_channels(mhu); + if (ret) + return ret; + + amba_set_drvdata(adev, mhu); + + ret = devm_mbox_controller_register(dev, &mhu->mbox); + if (ret) + dev_err(dev, "failed to register ARM MHUv2 driver %d\n", ret); + + return ret; +} + +static int mhuv2_remove(struct amba_device *adev) +{ + struct mhuv2 *mhu = amba_get_drvdata(adev); + + if (mhu->frame == SENDER_FRAME) + writel_relaxed(0x0, &mhu->send->access_request); + + return 0; +} + +static struct amba_id mhuv2_ids[] = { + { + /* 2.0 */ + .id = 0xbb0d1, + .mask = 0xfffff, + }, + { + /* 2.1 */ + .id = 0xbb076, + .mask = 0xfffff, + }, + { 0, 0 }, +}; +MODULE_DEVICE_TABLE(amba, mhuv2_ids); + +static struct amba_driver mhuv2_driver = { + .drv = { + .name = "arm-mhuv2", + }, + .id_table = mhuv2_ids, + .probe = mhuv2_probe, + .remove = mhuv2_remove, +}; +module_amba_driver(mhuv2_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("ARM MHUv2 Driver"); +MODULE_AUTHOR("Viresh Kumar "); +MODULE_AUTHOR("Tushar Khandelwal ");
diff --git a/include/linux/mailbox/arm_mhuv2_message.h b/include/linux/mailbox/arm_mhuv2_message.h new file mode 100644 index 000000000000..821b9d96daa4 --- /dev/null +++ b/include/linux/mailbox/arm_mhuv2_message.h @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ARM MHUv2 Mailbox Message + * + * Copyright (C) 2020 Arm Ltd. + * Copyright (C) 2020 Linaro Ltd. + */ + +#ifndef _LINUX_ARM_MHUV2_MESSAGE_H_ +#define _LINUX_ARM_MHUV2_MESSAGE_H_ + +#include + +/* Data structure for data-transfer protocol */ +struct arm_mhuv2_mbox_msg { + void *data; + size_t len; +}; + +#endif /* _LINUX_ARM_MHUV2_MESSAGE_H_ */
-- cgit v1.2.3 From bdfae1c9a913930eae5ea506733aa7c285e12a06 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 9 Dec 2020 09:44:44 +0800 Subject: vfio/type1: Add vfio_group_iommu_domain() Add the API for getting the domain from a vfio group. This could be used by physical device drivers which rely on the vfio/mdev framework for mediated-device user-level access. A typical use case looks like below: int pasid; struct vfio_group *vfio_group; struct iommu_domain *iommu_domain; struct device *dev = mdev_dev(mdev); struct device *iommu_device = mdev_get_iommu_device(dev); if (!iommu_device || !iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) return -EINVAL; vfio_group = vfio_group_get_external_user_from_dev(dev); if (IS_ERR_OR_NULL(vfio_group)) return -EFAULT; iommu_domain = vfio_group_iommu_domain(vfio_group); if (IS_ERR_OR_NULL(iommu_domain)) { vfio_group_put_external_user(vfio_group); return -EFAULT; } pasid = iommu_aux_get_pasid(iommu_domain, iommu_device); if (pasid < 0) { vfio_group_put_external_user(vfio_group); return -EFAULT; } /* Program device context with pasid value. */ ...
Signed-off-by: Lu Baolu Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 18 ++++++++++++++++++ drivers/vfio/vfio_iommu_type1.c | 24 ++++++++++++++++++++++++ include/linux/vfio.h | 4 ++++ 3 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 2151bc7f87ab..4ad8a35667a7 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -2331,6 +2331,24 @@ int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type, } EXPORT_SYMBOL(vfio_unregister_notifier); +struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + + if (!group) + return ERR_PTR(-EINVAL); + + container = group->container; + driver = container->iommu_driver; + if (likely(driver && driver->ops->group_iommu_domain)) + return driver->ops->group_iommu_domain(container->iommu_data, + group->iommu_group); + + return ERR_PTR(-ENOTTY); +} +EXPORT_SYMBOL_GPL(vfio_group_iommu_domain); + /** * Module/class support */ diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 67e827638995..0b4dedaa9128 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2980,6 +2980,29 @@ static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova, return ret; } +static struct iommu_domain * +vfio_iommu_type1_group_iommu_domain(void *iommu_data, + struct iommu_group *iommu_group) +{ + struct iommu_domain *domain = ERR_PTR(-ENODEV); + struct vfio_iommu *iommu = iommu_data; + struct vfio_domain *d; + + if (!iommu || !iommu_group) + return ERR_PTR(-EINVAL); + + mutex_lock(&iommu->lock); + list_for_each_entry(d, &iommu->domain_list, next) { + if (find_iommu_group(d, iommu_group)) { + domain = d->domain; + break; + } + } + mutex_unlock(&iommu->lock); + + return domain; +} + static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .name = "vfio-iommu-type1", .owner = THIS_MODULE, @@ -2993,6 +3016,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .register_notifier = vfio_iommu_type1_register_notifier, .unregister_notifier = vfio_iommu_type1_unregister_notifier, .dma_rw = vfio_iommu_type1_dma_rw, + .group_iommu_domain = vfio_iommu_type1_group_iommu_domain, }; static int __init vfio_iommu_type1_init(void) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 38d3c6a8dc7e..f45940b38a02 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -90,6 +90,8 @@ struct vfio_iommu_driver_ops { struct notifier_block *nb); int (*dma_rw)(void *iommu_data, dma_addr_t user_iova, void *data, size_t count, bool write); + struct iommu_domain *(*group_iommu_domain)(void *iommu_data, + struct iommu_group *group); }; extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); @@ -126,6 +128,8 @@ extern int vfio_group_unpin_pages(struct vfio_group *group, extern int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova, void *data, size_t len, bool write); +extern struct iommu_domain *vfio_group_iommu_domain(struct vfio_group *group); + /* each type has independent events */ enum vfio_notify_type { VFIO_IOMMU_NOTIFY = 0, -- cgit v1.2.3 From 88149082bb8ef31b289673669e080ec6a00c2e59 Mon Sep 17 00:00:00 2001 From: Hao Li Date: Tue, 8 Dec 2020 10:08:43 +0800 Subject: fs: Handle I_DONTCACHE in iput_final() instead of generic_drop_inode() If generic_drop_inode() returns true, it means iput_final() can evict this inode regardless of whether it is dirty or 
not. If we check I_DONTCACHE in generic_drop_inode(), any inode with this bit set will be evicted unconditionally. This is not the desired behavior because I_DONTCACHE only means the inode shouldn't be cached on the LRU list. As for whether we need to evict this inode, this is what generic_drop_inode() should do. This patch corrects the usage of I_DONTCACHE. This patch was proposed in [1]. [1]: https://lore.kernel.org/linux-fsdevel/20200831003407.GE12096@dread.disaster.area/ Fixes: dae2f8ed7992 ("fs: Lift XFS_IDONTCACHE to the VFS layer") Signed-off-by: Hao Li Reviewed-by: Dave Chinner Reviewed-by: Ira Weiny Signed-off-by: Al Viro --- fs/inode.c | 4 +++- include/linux/fs.h | 3 +-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index 9d78c37b00b8..5eea9912a0b9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1627,7 +1627,9 @@ static void iput_final(struct inode *inode) else drop = generic_drop_inode(inode); - if (!drop && (sb->s_flags & SB_ACTIVE)) { + if (!drop && + !(inode->i_state & I_DONTCACHE) && + (sb->s_flags & SB_ACTIVE)) { inode_add_lru(inode); spin_unlock(&inode->i_lock); return; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8667d0cdc71e..8bde32cf9711 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2878,8 +2878,7 @@ extern int inode_needs_sync(struct inode *inode); extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { - return !inode->i_nlink || inode_unhashed(inode) || - (inode->i_state & I_DONTCACHE); + return !inode->i_nlink || inode_unhashed(inode); } extern void d_mark_dontcache(struct inode *inode); -- cgit v1.2.3 From fecc4559780d52d174ea05e3bf543669165389c3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 2 Dec 2020 14:07:09 +0200 Subject: fsnotify: fix events reported to watching parent and child fsnotify_parent() used to send two separate events to backends when a parent inode is watching children and the child inode is also watching. In an attempt to avoid duplicate events in fanotify, we unified the two backend callbacks to a single callback and handled the reporting of the two separate events for the relevant backends (inotify and dnotify). However the handling is buggy and can result in inotify and dnotify listeners receiving events of the type they never asked for or spurious events. The problem is the unified event callback with two inode marks (parent and child) is called when any of the parent and child inodes are watched and interested in the event, but the parent inode's mark that is interested in the event on the child is not necessarily the one we are currently reporting to (it could belong to a different group). So before reporting the parent or child event flavor to backend we need to check that the mark is really interested in that event flavor. The semantics of INODE and CHILD marks were hard to follow and made the logic more complicated than it should have been. Replace it with INODE and PARENT marks semantics to hopefully make the logic more clear. Thanks to Hugh Dickins for spotting a bug in the earlier version of this patch. 
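For reference, the two event flavors can be reproduced from userspace with inotify. A minimal sketch, assuming /tmp/dir and /tmp/dir/child already exist:

	#include <stdio.h>
	#include <sys/inotify.h>
	#include <sys/stat.h>

	int main(void)
	{
		int fd = inotify_init1(0);
		/* Parent watching its children, and the child watching itself. */
		int wd_dir = inotify_add_watch(fd, "/tmp/dir", IN_ATTRIB);
		int wd_child = inotify_add_watch(fd, "/tmp/dir/child", IN_ATTRIB);

		chmod("/tmp/dir/child", 0600);

		/*
		 * Expected: one IN_ATTRIB event on wd_dir carrying the name
		 * "child" and one IN_ATTRIB event on wd_child without a name.
		 * With the bug, a group could receive a flavor its own mark
		 * never asked for, or a spurious event.
		 */
		printf("wd_dir=%d wd_child=%d\n", wd_dir, wd_child);
		return 0;
	}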
Fixes: 497b0c5a7c06 ("fsnotify: send event to parent and child with single callback") CC: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201202120713.702387-4-amir73il@gmail.com Reported-by: Hugh Dickins Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 7 ++-- fs/notify/fsnotify.c | 84 ++++++++++++++++++++++++---------------- include/linux/fsnotify_backend.h | 6 +-- 3 files changed, 57 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 9167884a61ec..1192c9953620 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -268,12 +268,11 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, continue; /* - * If the event is for a child and this mark is on a parent not + * If the event is on a child and this mark is on a parent not * watching children, don't send it! */ - if (event_mask & FS_EVENT_ON_CHILD && - type == FSNOTIFY_OBJ_TYPE_INODE && - !(mark->mask & FS_EVENT_ON_CHILD)) + if (type == FSNOTIFY_OBJ_TYPE_PARENT && + !(mark->mask & FS_EVENT_ON_CHILD)) continue; marks_mask |= mark->mask; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index c5c68bcbaadf..30d422b8c0fc 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -152,6 +152,13 @@ static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt, if (mask & FS_ISDIR) return false; + /* + * All events that are possible on child can also may be reported with + * parent/name info to inode/sb/mount. Otherwise, a watching parent + * could result in events reported with unexpected name info to sb/mount. + */ + BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT); + /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask(inode->i_fsnotify_mask); marks_mask |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask); @@ -249,6 +256,10 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group, path && d_unlinked(path->dentry)) return 0; + /* Check interest of this mark in case event was sent with two marks */ + if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) + return 0; + return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); } @@ -258,38 +269,46 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); - struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info); + struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info); int ret; if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; - /* - * An event can be sent on child mark iterator instead of inode mark - * iterator because of other groups that have interest of this inode - * and have marks on both parent and child. We can simplify this case. - */ - if (!inode_mark) { - inode_mark = child_mark; - child_mark = NULL; + if (parent_mark) { + /* + * parent_mark indicates that the parent inode is watching + * children and interested in this event, which is an event + * possible on child. But is *this mark* watching children and + * interested in this event? 
+ */ + if (parent_mark->mask & FS_EVENT_ON_CHILD) { + ret = fsnotify_handle_inode_event(group, parent_mark, mask, + data, data_type, dir, name, 0); + if (ret) + return ret; + } + if (!inode_mark) + return 0; + } + + if (mask & FS_EVENT_ON_CHILD) { + /* + * Some events can be sent on both parent dir and child marks + * (e.g. FS_ATTRIB). If both parent dir and child are + * watching, report the event once to parent dir with name (if + * interested) and once to child without name (if interested). + * The child watcher is expecting an event without a file name + * and without the FS_EVENT_ON_CHILD flag. + */ + mask &= ~FS_EVENT_ON_CHILD; dir = NULL; name = NULL; } - ret = fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, - dir, name, cookie); - if (ret || !child_mark) - return ret; - - /* - * Some events can be sent on both parent dir and child marks - * (e.g. FS_ATTRIB). If both parent dir and child are watching, - * report the event once to parent dir with name and once to child - * without name. - */ - return fsnotify_handle_inode_event(group, child_mark, mask, data, data_type, - NULL, NULL, 0); + return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, + dir, name, cookie); } static int send_to_group(__u32 mask, const void *data, int data_type, @@ -447,7 +466,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, struct fsnotify_iter_info iter_info = {}; struct super_block *sb; struct mount *mnt = NULL; - struct inode *child = NULL; + struct inode *parent = NULL; int ret = 0; __u32 test_mask, marks_mask; @@ -459,11 +478,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, inode = dir; } else if (mask & FS_EVENT_ON_CHILD) { /* - * Event on child - report on TYPE_INODE to dir if it is - * watching children and on TYPE_CHILD to child. + * Event on child - report on TYPE_PARENT to dir if it is + * watching children and on TYPE_INODE to child. 
*/ - child = inode; - inode = dir; + parent = dir; } sb = inode->i_sb; @@ -477,7 +495,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, if (!sb->s_fsnotify_marks && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && - (!child || !child->i_fsnotify_marks)) + (!parent || !parent->i_fsnotify_marks)) return 0; marks_mask = sb->s_fsnotify_mask; @@ -485,8 +503,8 @@ if (mnt) marks_mask |= mnt->mnt_fsnotify_mask; if (inode) marks_mask |= inode->i_fsnotify_mask; - if (child) - marks_mask |= child->i_fsnotify_mask; + if (parent) + marks_mask |= parent->i_fsnotify_mask; /* @@ -509,9 +527,9 @@ if (inode) { iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] = fsnotify_first_mark(&inode->i_fsnotify_marks); } - if (child) { - iter_info.marks[FSNOTIFY_OBJ_TYPE_CHILD] = - fsnotify_first_mark(&child->i_fsnotify_marks); + if (parent) { + iter_info.marks[FSNOTIFY_OBJ_TYPE_PARENT] = + fsnotify_first_mark(&parent->i_fsnotify_marks); } /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 4ee3044eedd0..a2e42d3cd87c 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -278,7 +278,7 @@ static inline const struct path *fsnotify_data_path(const void *data, enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_INODE, - FSNOTIFY_OBJ_TYPE_CHILD, + FSNOTIFY_OBJ_TYPE_PARENT, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, FSNOTIFY_OBJ_TYPE_COUNT, @@ -286,7 +286,7 @@ enum fsnotify_obj_type { }; #define FSNOTIFY_OBJ_TYPE_INODE_FL (1U << FSNOTIFY_OBJ_TYPE_INODE) -#define FSNOTIFY_OBJ_TYPE_CHILD_FL (1U << FSNOTIFY_OBJ_TYPE_CHILD) +#define FSNOTIFY_OBJ_TYPE_PARENT_FL (1U << FSNOTIFY_OBJ_TYPE_PARENT) #define FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL (1U << FSNOTIFY_OBJ_TYPE_VFSMOUNT) #define FSNOTIFY_OBJ_TYPE_SB_FL (1U << FSNOTIFY_OBJ_TYPE_SB) #define FSNOTIFY_OBJ_ALL_TYPES_MASK ((1U << FSNOTIFY_OBJ_TYPE_COUNT) - 1) @@ -331,7 +331,7 @@ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ } FSNOTIFY_ITER_FUNCS(inode, INODE) -FSNOTIFY_ITER_FUNCS(child, CHILD) +FSNOTIFY_ITER_FUNCS(parent, PARENT) FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) FSNOTIFY_ITER_FUNCS(sb, SB)
-- cgit v1.2.3 From 426506a7e0f1902268c3edbdc7e5475624a9d18b Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 8 Dec 2020 11:04:24 +0200 Subject: dmaengine: ti: k3-udma-glue: Add function to get device pointer for DMA API Glue layer users should use the device of the DMA for DMA mapping and allocations, as it is the DMA that accesses the descriptors and buffers, not the clients. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Link: https://lore.kernel.org/r/20201208090440.31792-5-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- drivers/dma/ti/k3-udma-glue.c | 14 ++++++++++++++ drivers/dma/ti/k3-udma-private.c | 6 ++++++ drivers/dma/ti/k3-udma.h | 1 + include/linux/dma/k3-udma-glue.h | 4 ++++ 4 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c index dfb65e382ab9..29d1524d1916 100644 --- a/drivers/dma/ti/k3-udma-glue.c +++ b/drivers/dma/ti/k3-udma-glue.c @@ -493,6 +493,13 @@ int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn) } EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_irq); +struct device * + k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn) +{ + return xudma_get_device(tx_chn->common.udmax);
+} +EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_dma_device); + static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) { const struct udma_tisci_rm *tisci_rm = rx_chn->common.tisci_rm; @@ -1201,3 +1208,10 @@ int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn, return flow->virq; } EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_irq); + +struct device * + k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn) +{ + return xudma_get_device(rx_chn->common.udmax); +} +EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_dma_device); diff --git a/drivers/dma/ti/k3-udma-private.c b/drivers/dma/ti/k3-udma-private.c index aa24e554f7b4..8ff7a264be03 100644 --- a/drivers/dma/ti/k3-udma-private.c +++ b/drivers/dma/ti/k3-udma-private.c @@ -50,6 +50,12 @@ struct udma_dev *of_xudma_dev_get(struct device_node *np, const char *property) } EXPORT_SYMBOL(of_xudma_dev_get); +struct device *xudma_get_device(struct udma_dev *ud) +{ + return ud->dev; +} +EXPORT_SYMBOL(xudma_get_device); + u32 xudma_dev_get_psil_base(struct udma_dev *ud) { return ud->psil_base; diff --git a/drivers/dma/ti/k3-udma.h b/drivers/dma/ti/k3-udma.h index 09c4529e013d..d1cace0cb43b 100644 --- a/drivers/dma/ti/k3-udma.h +++ b/drivers/dma/ti/k3-udma.h @@ -112,6 +112,7 @@ int xudma_navss_psil_unpair(struct udma_dev *ud, u32 src_thread, u32 dst_thread); struct udma_dev *of_xudma_dev_get(struct device_node *np, const char *property); +struct device *xudma_get_device(struct udma_dev *ud); void xudma_dev_put(struct udma_dev *ud); u32 xudma_dev_get_psil_base(struct udma_dev *ud); struct udma_tisci_rm *xudma_dev_get_tisci_rm(struct udma_dev *ud); diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index 5eb34ad973a7..d7c12f31377c 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -41,6 +41,8 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn, u32 k3_udma_glue_tx_get_hdesc_size(struct k3_udma_glue_tx_channel *tx_chn); u32 k3_udma_glue_tx_get_txcq_id(struct k3_udma_glue_tx_channel *tx_chn); int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn); +struct device * + k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn); enum { K3_UDMA_GLUE_SRC_TAG_LO_KEEP = 0, @@ -130,5 +132,7 @@ int k3_udma_glue_rx_flow_enable(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_idx); int k3_udma_glue_rx_flow_disable(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_idx); +struct device * + k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn); #endif /* K3_UDMA_GLUE_H_ */ -- cgit v1.2.3 From 4f910c035f38053ac8eb63a672c78862c535cd0f Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 8 Dec 2020 11:04:27 +0200 Subject: dmaengine: of-dma: Add support for optional router configuration callback Additional configuration of the DMA event router might be needed for a channel, and this cannot be done during the device_alloc_chan_resources callback since the router information is not yet available to the driver. If additional configuration is needed for the channel when a DMA router is in use, the driver can implement the device_router_config callback.
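For illustration only (not part of this patch), a router-aware DMA driver could wire up the new callback roughly as sketched below; the foo_* names are hypothetical placeholders for driver-private code:

	/* Hypothetical driver hook: chan->route_data was stored by the
	 * router's route_allocate callback in of_dma_router_xlate().
	 */
	static int foo_device_router_config(struct dma_chan *chan)
	{
		if (!chan->router || !chan->route_data)
			return 0;

		/* foo_apply_route() is an assumed driver-private helper */
		return foo_apply_route(chan, chan->route_data);
	}

	...
	/* at probe time, alongside the other dma_device callbacks */
	dma_dev->device_router_config = foo_device_router_config;

Since the callback is optional, of_dma_router_xlate() only invokes it when the driver has set it, and it releases the channel if the callback fails.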
Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201208090440.31792-8-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- drivers/dma/of-dma.c | 10 ++++++++++ include/linux/dmaengine.h | 2 ++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c index 8a4f608904b9..ec00b20ae8e4 100644 --- a/drivers/dma/of-dma.c +++ b/drivers/dma/of-dma.c @@ -75,8 +75,18 @@ static struct dma_chan *of_dma_router_xlate(struct of_phandle_args *dma_spec, ofdma->dma_router->route_free(ofdma->dma_router->dev, route_data); } else { + int ret = 0; + chan->router = ofdma->dma_router; chan->route_data = route_data; + + if (chan->device->device_router_config) + ret = chan->device->device_router_config(chan); + + if (ret) { + dma_release_channel(chan); + chan = ERR_PTR(ret); + } } /* diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 493a047ed0a2..aed44888cad3 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -805,6 +805,7 @@ struct dma_filter { * by tx_status * @device_alloc_chan_resources: allocate resources and return the * number of allocated descriptors + * @device_router_config: optional callback for DMA router configuration * @device_free_chan_resources: release DMA channel's resources * @device_prep_dma_memcpy: prepares a memcpy operation * @device_prep_dma_xor: prepares a xor operation @@ -879,6 +880,7 @@ struct dma_device { enum dma_residue_granularity residue_granularity; int (*device_alloc_chan_resources)(struct dma_chan *chan); + int (*device_router_config)(struct dma_chan *chan); void (*device_free_chan_resources)(struct dma_chan *chan); struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)( -- cgit v1.2.3 From ab650ef6d548153862119e1bf3bf267510707f48 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 8 Dec 2020 11:04:28 +0200 Subject: dmaengine: Add support for per channel coherency handling If the DMA device supports per channel coherency configuration (a channel can be configured to have a coherent or a non-coherent view) then a single device (the DMA controller's device) cannot be used for the DMA API on all channels, as channels can have different coherency. Introduce a chan_dma_dev flag in struct dma_chan_dev and a new helper to get the device pointer to be used for the DMA API for the given channel.
Client drivers should be updated to be able to support per channel coherency by: - dma_map_single(chan->device->dev, ptr, size, DMA_TO_DEVICE); + struct device *dma_dev = dmaengine_get_dma_device(chan); + + dma_map_single(dma_dev, ptr, size, DMA_TO_DEVICE); Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201208090440.31792-9-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index aed44888cad3..68130f5f599e 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -357,11 +357,14 @@ struct dma_chan { * @chan: driver channel device * @device: sysfs device * @dev_id: parent dma_device dev_id + * @chan_dma_dev: The channel is using custom/different dma-mapping + * compared to the parent dma_device */ struct dma_chan_dev { struct dma_chan *chan; struct device device; int dev_id; + bool chan_dma_dev; }; /** @@ -1618,4 +1621,13 @@ dmaengine_get_direction_text(enum dma_transfer_direction dir) return "invalid"; } } + +static inline struct device *dmaengine_get_dma_device(struct dma_chan *chan) +{ + if (chan->dev->chan_dma_dev) + return &chan->dev->device; + + return chan->device->dev; +} + #endif /* DMAENGINE_H */ -- cgit v1.2.3 From b9366e2577a38ca5322f326cff9752c2008597c6 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 8 Dec 2020 11:04:33 +0200 Subject: dmaengine: ti: k3-psil: Extend psil_endpoint_config for K3 PKTDMA Additional fields are needed for K3 PKTDMA to be able to handle the mapped channels (channels are locked to handle specific threads) and flow ranges for these mapped threads. PKTDMA also introduces tflow for tx channels, which cannot be found in the K3 UDMA architecture. Signed-off-by: Peter Ujfalusi Reviewed-by: Grygorii Strashko Link: https://lore.kernel.org/r/20201208090440.31792-14-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- include/linux/dma/k3-psil.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma/k3-psil.h b/include/linux/dma/k3-psil.h index 1962f75fa2d3..36e22c5a0f29 100644 --- a/include/linux/dma/k3-psil.h +++ b/include/linux/dma/k3-psil.h @@ -50,6 +50,15 @@ enum psil_endpoint_type { * @channel_tpl: Desired throughput level for the channel * @pdma_acc32: ACC32 must be enabled on the PDMA side * @pdma_burst: BURST must be enabled on the PDMA side + * @mapped_channel_id: PKTDMA thread to channel mapping for mapped channels. + * The thread must be serviced by the specified channel if + * mapped_channel_id is >= 0 in case of PKTDMA + * @flow_start: PKTDMA flow range start of mapped channel. Unmapped + * channels use flow_id == chan_id + * @flow_num: PKTDMA flow count of mapped channel. Unmapped channels + * use flow_id == chan_id + * @default_flow_id: PKTDMA default (r)flow index of mapped channel. + * Must be within the flow range of the mapped channel.
*/ struct psil_endpoint_config { enum psil_endpoint_type ep_type; @@ -63,6 +72,13 @@ struct psil_endpoint_config { /* PDMA properties, valid for PSIL_EP_PDMA_* */ unsigned pdma_acc32:1; unsigned pdma_burst:1; + + /* PKTDMA mapped channel */ + int mapped_channel_id; + /* PKTDMA tflow and rflow ranges for mapped channel */ + u16 flow_start; + u16 flow_num; + u16 default_flow_id; }; int psil_set_new_ep_config(struct device *dev, const char *name, -- cgit v1.2.3 From fc373e47d72605cc3f5012ddda49d2dca430d51f Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 8 Dec 2020 11:04:35 +0200 Subject: dmaengine: ti: Add support for k3 event routers In the k3 architecture a DMA channel (in TR mode) can be triggered by global events originating from different modules. The events for triggers can be sent from any module which is connected to the PSI-L fabric, but the event number to be sent is DMA channel specific; it is only known after the channel itself is requested. The router operation needs to be split up: - route_allocate: configure the dma_spec for the DMA and store the configuration which is needed for the router's input - set_event: callback used by the DMA driver to set the event number for the channel and enable the routing Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201208090440.31792-16-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- include/linux/dma/k3-event-router.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/linux/dma/k3-event-router.h (limited to 'include/linux') diff --git a/include/linux/dma/k3-event-router.h b/include/linux/dma/k3-event-router.h new file mode 100644 index 000000000000..e3f88b2f87be --- /dev/null +++ b/include/linux/dma/k3-event-router.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Texas Instruments Incorporated - https://www.ti.com + */ + +#ifndef K3_EVENT_ROUTER_ +#define K3_EVENT_ROUTER_ + +#include + +struct k3_event_route_data { + void *priv; + int (*set_event)(void *priv, u32 event); +}; + +#endif /* K3_EVENT_ROUTER_ */ -- cgit v1.2.3 From d782298c6f6b854452965b56d91616dfb60490c5 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Tue, 8 Dec 2020 11:04:36 +0200 Subject: soc: ti: k3-ringacc: add AM64 DMA rings support. The DMAs in AM64 have built-in rings, compared to AM654/J721e/J7200 where a separate and generic ringacc is used. The ring SW interface is similar to ringacc, with some major architectural differences: the rings are part of the DMA (BCDMA or PKTDMA), and they are dual mode rings, modeled as a pair of ring objects which share a common configuration and memory buffer, but have separate real-time control register sets for each direction: mem2dev (forward) and dev2mem (reverse). The ringacc driver must be initialized for DMA rings use with k3_ringacc_dmarings_init(), as the DMA rings are not an independent device the way the ringacc is. AM64 rings must be requested only using k3_ringacc_request_rings_pair(), and the forward ring must always be initialized/configured. After this, any other ringacc APIs can be used without any caller changes.
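For illustration only (not part of this patch), requesting and configuring an AM64 DMA ring pair from a client could look like the sketch below, assuming a ringacc handle obtained from k3_ringacc_dmarings_init(), with chan_id as a placeholder for the channel's ring index and error handling trimmed:

	struct k3_ring *fwd_ring, *compl_ring;
	struct k3_ring_cfg ring_cfg = {
		.size = 128,
		/* DMA rings only accept 8-byte elements in RING mode */
		.elm_size = K3_RINGACC_RING_ELSIZE_8,
		.mode = K3_RINGACC_RING_MODE_RING,
	};
	int ret;

	/* DMA rings are requested as a pair, by the forward ring ID */
	ret = k3_ringacc_request_rings_pair(ringacc, chan_id, -1,
					    &fwd_ring, &compl_ring);
	if (ret)
		return ret;

	/* Configuring the forward ring also sets up the shared reverse ring */
	ret = k3_ringacc_ring_cfg(fwd_ring, &ring_cfg);

Note that when cfg->dma_dev is left unset, the driver warns and falls back to the ringacc device for the ring memory allocation.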
Signed-off-by: Grygorii Strashko Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201208090440.31792-17-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- drivers/soc/ti/k3-ringacc.c | 325 +++++++++++++++++++++++++++++++++++++- include/linux/soc/ti/k3-ringacc.h | 17 ++ 2 files changed, 335 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/ti/k3-ringacc.c b/drivers/soc/ti/k3-ringacc.c index 119164abcb41..c88c305ba367 100644 --- a/drivers/soc/ti/k3-ringacc.c +++ b/drivers/soc/ti/k3-ringacc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ static LIST_HEAD(k3_ringacc_list); static DEFINE_MUTEX(k3_ringacc_list_lock); #define K3_RINGACC_CFG_RING_SIZE_ELCNT_MASK GENMASK(19, 0) +#define K3_DMARING_CFG_RING_SIZE_ELCNT_MASK GENMASK(15, 0) /** * struct k3_ring_rt_regs - The RA realtime Control/Status Registers region @@ -43,7 +45,13 @@ struct k3_ring_rt_regs { u32 hwindx; }; -#define K3_RINGACC_RT_REGS_STEP 0x1000 +#define K3_RINGACC_RT_REGS_STEP 0x1000 +#define K3_DMARING_RT_REGS_STEP 0x2000 +#define K3_DMARING_RT_REGS_REVERSE_OFS 0x1000 +#define K3_RINGACC_RT_OCC_MASK GENMASK(20, 0) +#define K3_DMARING_RT_OCC_TDOWN_COMPLETE BIT(31) +#define K3_DMARING_RT_DB_ENTRY_MASK GENMASK(7, 0) +#define K3_DMARING_RT_DB_TDOWN_ACK BIT(31) /** * struct k3_ring_fifo_regs - The Ring Accelerator Queues Registers region @@ -122,6 +130,7 @@ struct k3_ring_state { u32 occ; u32 windex; u32 rindex; + u32 tdown_complete:1; }; /** @@ -143,6 +152,7 @@ struct k3_ring_state { * @use_count: Use count for shared rings * @proxy_id: RA Ring Proxy Id (only if @K3_RINGACC_RING_USE_PROXY) * @dma_dev: device to be used for DMA API (allocation, mapping) + * @asel: Address Space Select value for physical addresses */ struct k3_ring { struct k3_ring_rt_regs __iomem *rt; @@ -157,12 +167,15 @@ struct k3_ring { u32 flags; #define K3_RING_FLAG_BUSY BIT(1) #define K3_RING_FLAG_SHARED BIT(2) +#define K3_RING_FLAG_REVERSE BIT(3) struct k3_ring_state state; u32 ring_id; struct k3_ringacc *parent; u32 use_count; int proxy_id; struct device *dma_dev; + u32 asel; +#define K3_ADDRESS_ASEL_SHIFT 48 }; struct k3_ringacc_ops { @@ -188,6 +201,7 @@ struct k3_ringacc_ops { * @tisci_ring_ops: ti-sci rings ops * @tisci_dev_id: ti-sci device id * @ops: SoC specific ringacc operation + * @dma_rings: indicate DMA ring (dual ring within BCDMA/PKTDMA) */ struct k3_ringacc { struct device *dev; @@ -210,6 +224,7 @@ struct k3_ringacc { u32 tisci_dev_id; const struct k3_ringacc_ops *ops; + bool dma_rings; }; /** @@ -221,6 +236,21 @@ struct k3_ringacc_soc_data { unsigned dma_ring_reset_quirk:1; }; +static int k3_ringacc_ring_read_occ(struct k3_ring *ring) +{ + return readl(&ring->rt->occ) & K3_RINGACC_RT_OCC_MASK; +} + +static void k3_ringacc_ring_update_occ(struct k3_ring *ring) +{ + u32 val; + + val = readl(&ring->rt->occ); + + ring->state.occ = val & K3_RINGACC_RT_OCC_MASK; + ring->state.tdown_complete = !!(val & K3_DMARING_RT_OCC_TDOWN_COMPLETE); +} + static long k3_ringacc_ring_get_fifo_pos(struct k3_ring *ring) { return K3_RINGACC_FIFO_WINDOW_SIZE_BYTES - @@ -234,12 +264,24 @@ static void *k3_ringacc_get_elm_addr(struct k3_ring *ring, u32 idx) static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem); static int k3_ringacc_ring_pop_mem(struct k3_ring *ring, void *elem); +static int k3_dmaring_fwd_pop(struct k3_ring *ring, void *elem); +static int k3_dmaring_reverse_pop(struct k3_ring *ring, void *elem); static struct k3_ring_ops 
k3_ring_mode_ring_ops = { .push_tail = k3_ringacc_ring_push_mem, .pop_head = k3_ringacc_ring_pop_mem, }; +static struct k3_ring_ops k3_dmaring_fwd_ops = { + .push_tail = k3_ringacc_ring_push_mem, + .pop_head = k3_dmaring_fwd_pop, +}; + +static struct k3_ring_ops k3_dmaring_reverse_ops = { + /* Reverse side of the DMA ring can only be popped by SW */ + .pop_head = k3_dmaring_reverse_pop, +}; + static int k3_ringacc_ring_push_io(struct k3_ring *ring, void *elem); static int k3_ringacc_ring_pop_io(struct k3_ring *ring, void *elem); static int k3_ringacc_ring_push_head_io(struct k3_ring *ring, void *elem); @@ -342,6 +384,40 @@ error: } EXPORT_SYMBOL_GPL(k3_ringacc_request_ring); +static int k3_dmaring_request_dual_ring(struct k3_ringacc *ringacc, int fwd_id, + struct k3_ring **fwd_ring, + struct k3_ring **compl_ring) +{ + int ret = 0; + + /* + * DMA rings must be requested by ID, completion ring is the reverse + * side of the forward ring + */ + if (fwd_id < 0) + return -EINVAL; + + mutex_lock(&ringacc->req_lock); + + if (test_bit(fwd_id, ringacc->rings_inuse)) { + ret = -EBUSY; + goto error; + } + + *fwd_ring = &ringacc->rings[fwd_id]; + *compl_ring = &ringacc->rings[fwd_id + ringacc->num_rings]; + set_bit(fwd_id, ringacc->rings_inuse); + ringacc->rings[fwd_id].use_count++; + dev_dbg(ringacc->dev, "Giving ring#%d\n", fwd_id); + + mutex_unlock(&ringacc->req_lock); + return 0; + +error: + mutex_unlock(&ringacc->req_lock); + return ret; +} + int k3_ringacc_request_rings_pair(struct k3_ringacc *ringacc, int fwd_id, int compl_id, struct k3_ring **fwd_ring, @@ -352,6 +428,10 @@ int k3_ringacc_request_rings_pair(struct k3_ringacc *ringacc, if (!fwd_ring || !compl_ring) return -EINVAL; + if (ringacc->dma_rings) + return k3_dmaring_request_dual_ring(ringacc, fwd_id, + fwd_ring, compl_ring); + *fwd_ring = k3_ringacc_request_ring(ringacc, fwd_id, 0); if (!(*fwd_ring)) return -ENODEV; @@ -421,7 +501,7 @@ void k3_ringacc_ring_reset_dma(struct k3_ring *ring, u32 occ) goto reset; if (!occ) - occ = readl(&ring->rt->occ); + occ = k3_ringacc_ring_read_occ(ring); if (occ) { u32 db_ring_cnt, db_ring_cnt_cur; @@ -496,6 +576,13 @@ int k3_ringacc_ring_free(struct k3_ring *ring) ringacc = ring->parent; + /* + * DMA rings: rings shared memory and configuration, only forward ring + * is configured and reverse ring considered as slave. 
+ */ + if (ringacc->dma_rings && (ring->flags & K3_RING_FLAG_REVERSE)) + return 0; + dev_dbg(ring->parent->dev, "flags: 0x%08x\n", ring->flags); if (!test_bit(ring->ring_id, ringacc->rings_inuse)) @@ -517,6 +604,8 @@ int k3_ringacc_ring_free(struct k3_ring *ring) ring->flags = 0; ring->ops = NULL; ring->dma_dev = NULL; + ring->asel = 0; + if (ring->proxy_id != K3_RINGACC_PROXY_NOT_USED) { clear_bit(ring->proxy_id, ringacc->proxy_inuse); ring->proxy = NULL; @@ -581,6 +670,7 @@ static int k3_ringacc_ring_cfg_sci(struct k3_ring *ring) ring_cfg.count = ring->size; ring_cfg.mode = ring->mode; ring_cfg.size = ring->elm_size; + ring_cfg.asel = ring->asel; ret = ringacc->tisci_ring_ops->set_cfg(ringacc->tisci, &ring_cfg); if (ret) @@ -590,6 +680,90 @@ static int k3_ringacc_ring_cfg_sci(struct k3_ring *ring) return ret; } +static int k3_dmaring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) +{ + struct k3_ringacc *ringacc; + struct k3_ring *reverse_ring; + int ret = 0; + + if (cfg->elm_size != K3_RINGACC_RING_ELSIZE_8 || + cfg->mode != K3_RINGACC_RING_MODE_RING || + cfg->size & ~K3_DMARING_CFG_RING_SIZE_ELCNT_MASK) + return -EINVAL; + + ringacc = ring->parent; + + /* + * DMA rings: rings shared memory and configuration, only forward ring + * is configured and reverse ring considered as slave. + */ + if (ringacc->dma_rings && (ring->flags & K3_RING_FLAG_REVERSE)) + return 0; + + if (!test_bit(ring->ring_id, ringacc->rings_inuse)) + return -EINVAL; + + ring->size = cfg->size; + ring->elm_size = cfg->elm_size; + ring->mode = cfg->mode; + ring->asel = cfg->asel; + ring->dma_dev = cfg->dma_dev; + if (!ring->dma_dev) { + dev_warn(ringacc->dev, "dma_dev is not provided for ring%d\n", + ring->ring_id); + ring->dma_dev = ringacc->dev; + } + + memset(&ring->state, 0, sizeof(ring->state)); + + ring->ops = &k3_dmaring_fwd_ops; + + ring->ring_mem_virt = dma_alloc_coherent(ring->dma_dev, + ring->size * (4 << ring->elm_size), + &ring->ring_mem_dma, GFP_KERNEL); + if (!ring->ring_mem_virt) { + dev_err(ringacc->dev, "Failed to alloc ring mem\n"); + ret = -ENOMEM; + goto err_free_ops; + } + + ret = k3_ringacc_ring_cfg_sci(ring); + if (ret) + goto err_free_mem; + + ring->flags |= K3_RING_FLAG_BUSY; + + k3_ringacc_ring_dump(ring); + + /* DMA rings: configure reverse ring */ + reverse_ring = &ringacc->rings[ring->ring_id + ringacc->num_rings]; + reverse_ring->size = cfg->size; + reverse_ring->elm_size = cfg->elm_size; + reverse_ring->mode = cfg->mode; + reverse_ring->asel = cfg->asel; + memset(&reverse_ring->state, 0, sizeof(reverse_ring->state)); + reverse_ring->ops = &k3_dmaring_reverse_ops; + + reverse_ring->ring_mem_virt = ring->ring_mem_virt; + reverse_ring->ring_mem_dma = ring->ring_mem_dma; + reverse_ring->flags |= K3_RING_FLAG_BUSY; + k3_ringacc_ring_dump(reverse_ring); + + return 0; + +err_free_mem: + dma_free_coherent(ring->dma_dev, + ring->size * (4 << ring->elm_size), + ring->ring_mem_virt, + ring->ring_mem_dma); +err_free_ops: + ring->ops = NULL; + ring->proxy = NULL; + ring->dma_dev = NULL; + ring->asel = 0; + return ret; +} + int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) { struct k3_ringacc *ringacc; @@ -597,8 +771,12 @@ int k3_ringacc_ring_cfg(struct k3_ring *ring, struct k3_ring_cfg *cfg) if (!ring || !cfg) return -EINVAL; + ringacc = ring->parent; + if (ringacc->dma_rings) + return k3_dmaring_cfg(ring, cfg); + if (cfg->elm_size > K3_RINGACC_RING_ELSIZE_256 || cfg->mode >= K3_RINGACC_RING_MODE_INVALID || cfg->size & ~K3_RINGACC_CFG_RING_SIZE_ELCNT_MASK || @@ -705,7 +883,7 
@@ u32 k3_ringacc_ring_get_free(struct k3_ring *ring) return -EINVAL; if (!ring->state.free) - ring->state.free = ring->size - readl(&ring->rt->occ); + ring->state.free = ring->size - k3_ringacc_ring_read_occ(ring); return ring->state.free; } @@ -716,7 +894,7 @@ u32 k3_ringacc_ring_get_occ(struct k3_ring *ring) if (!ring || !(ring->flags & K3_RING_FLAG_BUSY)) return -EINVAL; - return readl(&ring->rt->occ); + return k3_ringacc_ring_read_occ(ring); } EXPORT_SYMBOL_GPL(k3_ringacc_ring_get_occ); @@ -892,6 +1070,72 @@ static int k3_ringacc_ring_pop_tail_io(struct k3_ring *ring, void *elem) K3_RINGACC_ACCESS_MODE_POP_HEAD); } +/* + * The element is 48 bits of address + ASEL bits in the ring. + * ASEL is used by the DMAs and should be removed for the kernel as it is not + * part of the physical memory address. + */ +static void k3_dmaring_remove_asel_from_elem(u64 *elem) +{ + *elem &= GENMASK_ULL(K3_ADDRESS_ASEL_SHIFT - 1, 0); +} + +static int k3_dmaring_fwd_pop(struct k3_ring *ring, void *elem) +{ + void *elem_ptr; + u32 elem_idx; + + /* + * DMA rings: the forward ring is always tied to a DMA channel and HW + * does not maintain any state data required for POP operation, and it + * is unknown how many elements were consumed by HW. So, to actually + * do POP, the read pointer has to be recalculated every time. + */ + ring->state.occ = k3_ringacc_ring_read_occ(ring); + if (ring->state.windex >= ring->state.occ) + elem_idx = ring->state.windex - ring->state.occ; + else + elem_idx = ring->size - (ring->state.occ - ring->state.windex); + + elem_ptr = k3_ringacc_get_elm_addr(ring, elem_idx); + memcpy(elem, elem_ptr, (4 << ring->elm_size)); + k3_dmaring_remove_asel_from_elem(elem); + + ring->state.occ--; + writel(-1, &ring->rt->db); + + dev_dbg(ring->parent->dev, "%s: occ%d Windex%d Rindex%d pos_ptr%px\n", + __func__, ring->state.occ, ring->state.windex, elem_idx, + elem_ptr); + return 0; +} + +static int k3_dmaring_reverse_pop(struct k3_ring *ring, void *elem) +{ + void *elem_ptr; + + elem_ptr = k3_ringacc_get_elm_addr(ring, ring->state.rindex); + + if (ring->state.occ) { + memcpy(elem, elem_ptr, (4 << ring->elm_size)); + k3_dmaring_remove_asel_from_elem(elem); + + ring->state.rindex = (ring->state.rindex + 1) % ring->size; + ring->state.occ--; + writel(-1 & K3_DMARING_RT_DB_ENTRY_MASK, &ring->rt->db); + } else if (ring->state.tdown_complete) { + dma_addr_t *value = elem; + + *value = CPPI5_TDCM_MARKER; + writel(K3_DMARING_RT_DB_TDOWN_ACK, &ring->rt->db); + ring->state.tdown_complete = false; + } + + dev_dbg(ring->parent->dev, "%s: occ%d index%d pos_ptr%px\n", + __func__, ring->state.occ, ring->state.rindex, elem_ptr); + return 0; +} + static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem) { void *elem_ptr; @@ -899,6 +1143,11 @@ static int k3_ringacc_ring_push_mem(struct k3_ring *ring, void *elem) elem_ptr = k3_ringacc_get_elm_addr(ring, ring->state.windex); memcpy(elem_ptr, elem, (4 << ring->elm_size)); + if (ring->parent->dma_rings) { + u64 *addr = elem_ptr; + + *addr |= ((u64)ring->asel << K3_ADDRESS_ASEL_SHIFT); + } ring->state.windex = (ring->state.windex + 1) % ring->size; ring->state.free--; @@ -975,12 +1224,12 @@ int k3_ringacc_ring_pop(struct k3_ring *ring, void *elem) return -EINVAL; if (!ring->state.occ) - ring->state.occ = k3_ringacc_ring_get_occ(ring); + k3_ringacc_ring_update_occ(ring); dev_dbg(ring->parent->dev, "ring_pop: occ%d index%d\n", ring->state.occ, ring->state.rindex); - if (!ring->state.occ) + if (!ring->state.occ && !ring->state.tdown_complete) return -ENODATA; if
(ring->ops && ring->ops->pop_head) @@ -998,7 +1247,7 @@ int k3_ringacc_ring_pop_tail(struct k3_ring *ring, void *elem) return -EINVAL; if (!ring->state.occ) - ring->state.occ = k3_ringacc_ring_get_occ(ring); + k3_ringacc_ring_update_occ(ring); dev_dbg(ring->parent->dev, "ring_pop_tail: occ%d index%d\n", ring->state.occ, ring->state.rindex); @@ -1203,6 +1452,68 @@ static const struct of_device_id k3_ringacc_of_match[] = { {}, }; +struct k3_ringacc *k3_ringacc_dmarings_init(struct platform_device *pdev, + struct k3_ringacc_init_data *data) +{ + struct device *dev = &pdev->dev; + struct k3_ringacc *ringacc; + void __iomem *base_rt; + struct resource *res; + int i; + + ringacc = devm_kzalloc(dev, sizeof(*ringacc), GFP_KERNEL); + if (!ringacc) + return ERR_PTR(-ENOMEM); + + ringacc->dev = dev; + ringacc->dma_rings = true; + ringacc->num_rings = data->num_rings; + ringacc->tisci = data->tisci; + ringacc->tisci_dev_id = data->tisci_dev_id; + + mutex_init(&ringacc->req_lock); + + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ringrt"); + base_rt = devm_ioremap_resource(dev, res); + if (IS_ERR(base_rt)) + return base_rt; + + ringacc->rings = devm_kzalloc(dev, + sizeof(*ringacc->rings) * + ringacc->num_rings * 2, + GFP_KERNEL); + ringacc->rings_inuse = devm_kcalloc(dev, + BITS_TO_LONGS(ringacc->num_rings), + sizeof(unsigned long), GFP_KERNEL); + + if (!ringacc->rings || !ringacc->rings_inuse) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < ringacc->num_rings; i++) { + struct k3_ring *ring = &ringacc->rings[i]; + + ring->rt = base_rt + K3_DMARING_RT_REGS_STEP * i; + ring->parent = ringacc; + ring->ring_id = i; + ring->proxy_id = K3_RINGACC_PROXY_NOT_USED; + + ring = &ringacc->rings[ringacc->num_rings + i]; + ring->rt = base_rt + K3_DMARING_RT_REGS_STEP * i + + K3_DMARING_RT_REGS_REVERSE_OFS; + ring->parent = ringacc; + ring->ring_id = i; + ring->proxy_id = K3_RINGACC_PROXY_NOT_USED; + ring->flags = K3_RING_FLAG_REVERSE; + } + + ringacc->tisci_ring_ops = &ringacc->tisci->ops.rm_ring_ops; + + dev_info(dev, "Number of rings: %u\n", ringacc->num_rings); + + return ringacc; +} +EXPORT_SYMBOL_GPL(k3_ringacc_dmarings_init); + static int k3_ringacc_probe(struct platform_device *pdev) { const struct ringacc_match_data *match_data; diff --git a/include/linux/soc/ti/k3-ringacc.h b/include/linux/soc/ti/k3-ringacc.h index 658dc71d2901..39b022b92598 100644 --- a/include/linux/soc/ti/k3-ringacc.h +++ b/include/linux/soc/ti/k3-ringacc.h @@ -70,6 +70,7 @@ struct k3_ring; * @dma_dev: Master device which is using and accessing to the ring * memory when the mode is K3_RINGACC_RING_MODE_RING. Memory allocations * should be done using this device. 
+ * @asel: Address Space Select value for physical addresses */ struct k3_ring_cfg { u32 size; @@ -79,6 +80,7 @@ u32 flags; struct device *dma_dev; + u32 asel; }; #define K3_RINGACC_RING_ID_ANY (-1) @@ -250,4 +252,19 @@ int k3_ringacc_ring_pop_tail(struct k3_ring *ring, void *elem); u32 k3_ringacc_get_tisci_dev_id(struct k3_ring *ring); +/* DMA ring support */ +struct ti_sci_handle; + +/** + * struct k3_ringacc_init_data - Initialization data for DMA rings + */ +struct k3_ringacc_init_data { + const struct ti_sci_handle *tisci; + u32 tisci_dev_id; + u32 num_rings; +}; + +struct k3_ringacc *k3_ringacc_dmarings_init(struct platform_device *pdev, + struct k3_ringacc_init_data *data); + #endif /* __SOC_TI_K3_RINGACC_API_H_ */ -- cgit v1.2.3 From 5b65781d06ea90ef2f8e51a13352c43c3daa8cdc Mon Sep 17 00:00:00 2001 From: Vignesh Raghavendra Date: Tue, 8 Dec 2020 11:04:40 +0200 Subject: dmaengine: ti: k3-udma-glue: Add support for K3 PKTDMA This commit adds support for PKTDMA in the k3-udma glue driver. Use the new psil_endpoint_config struct to get static data for a given channel or a flow during setup. Make sure that the RX flows being mapped to an RX channel are within the range of flows that has been allocated to that RX channel. Signed-off-by: Vignesh Raghavendra Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20201208090440.31792-21-peter.ujfalusi@ti.com Signed-off-by: Vinod Koul --- drivers/dma/ti/k3-udma-glue.c | 291 ++++++++++++++++++++++++++++++++++----- drivers/dma/ti/k3-udma-private.c | 24 ++++ drivers/dma/ti/k3-udma.h | 4 + include/linux/dma/k3-udma-glue.h | 8 ++ 4 files changed, 289 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/ti/k3-udma-glue.c b/drivers/dma/ti/k3-udma-glue.c index e6ebcd98c02a..4fdd9f06b723 100644 --- a/drivers/dma/ti/k3-udma-glue.c +++ b/drivers/dma/ti/k3-udma-glue.c @@ -22,6 +22,7 @@ struct k3_udma_glue_common { struct device *dev; + struct device chan_dev; struct udma_dev *udmax; const struct udma_tisci_rm *tisci_rm; struct k3_ringacc *ringacc; u32 src_thread; u32 dst_thread; u32 hdesc_size; bool epib; u32 psdata_size; u32 swdata_size; - u32 atype; + u32 atype_asel; + struct psil_endpoint_config *ep_config; }; struct k3_udma_glue_tx_channel { @@ -53,6 +55,8 @@ struct k3_udma_glue_tx_channel { bool tx_filt_einfo; bool tx_filt_pswords; bool tx_supr_tdpkt; + + int udma_tflow_id; }; struct k3_udma_glue_rx_flow { @@ -81,6 +85,16 @@ struct k3_udma_glue_rx_channel { u32 flows_ready; }; +static void k3_udma_chan_dev_release(struct device *dev) +{ + /* The struct containing the device is devm managed */ +} + +static struct class k3_udma_glue_devclass = { + .name = "k3_udma_glue_chan", + .dev_release = k3_udma_chan_dev_release, +}; + #define K3_UDMAX_TDOWN_TIMEOUT_US 1000 static int of_k3_udma_glue_parse(struct device_node *udmax_np, @@ -100,7 +114,6 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np, const char *name, struct k3_udma_glue_common *common, bool tx_chn) { - struct psil_endpoint_config *ep_config; struct of_phandle_args dma_spec; u32 thread_id; int ret = 0; @@ -117,15 +130,26 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np, &dma_spec)) return -ENOENT; + ret = of_k3_udma_glue_parse(dma_spec.np, common); + if (ret) + goto out_put_spec; + thread_id = dma_spec.args[0]; if (dma_spec.args_count == 2) { - if (dma_spec.args[1] > 2) { + if (dma_spec.args[1] > 2 && !xudma_is_pktdma(common->udmax)) { dev_err(common->dev, "Invalid channel atype: %u\n", dma_spec.args[1]);
ret = -EINVAL; goto out_put_spec; } - common->atype = dma_spec.args[1]; + if (dma_spec.args[1] > 15 && xudma_is_pktdma(common->udmax)) { + dev_err(common->dev, "Invalid channel asel: %u\n", + dma_spec.args[1]); + ret = -EINVAL; + goto out_put_spec; + } + + common->atype_asel = dma_spec.args[1]; } if (tx_chn && !(thread_id & K3_PSIL_DST_THREAD_ID_OFFSET)) { @@ -139,25 +163,23 @@ static int of_k3_udma_glue_parse_chn(struct device_node *chn_np, } /* get psil endpoint config */ - ep_config = psil_get_ep_config(thread_id); - if (IS_ERR(ep_config)) { + common->ep_config = psil_get_ep_config(thread_id); + if (IS_ERR(common->ep_config)) { dev_err(common->dev, "No configuration for psi-l thread 0x%04x\n", thread_id); - ret = PTR_ERR(ep_config); + ret = PTR_ERR(common->ep_config); goto out_put_spec; } - common->epib = ep_config->needs_epib; - common->psdata_size = ep_config->psd_size; + common->epib = common->ep_config->needs_epib; + common->psdata_size = common->ep_config->psd_size; if (tx_chn) common->dst_thread = thread_id; else common->src_thread = thread_id; - ret = of_k3_udma_glue_parse(dma_spec.np, common); - out_put_spec: of_node_put(dma_spec.np); return ret; @@ -223,7 +245,7 @@ static int k3_udma_glue_cfg_tx_chn(struct k3_udma_glue_tx_channel *tx_chn) req.tx_supr_tdpkt = 1; req.tx_fetch_size = tx_chn->common.hdesc_size >> 2; req.txcq_qnum = k3_ringacc_get_ring_id(tx_chn->ringtxcq); - req.tx_atype = tx_chn->common.atype; + req.tx_atype = tx_chn->common.atype_asel; return tisci_rm->tisci_udmap_ops->tx_ch_cfg(tisci_rm->tisci, &req); } @@ -255,8 +277,14 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev, tx_chn->common.psdata_size, tx_chn->common.swdata_size); + if (xudma_is_pktdma(tx_chn->common.udmax)) + tx_chn->udma_tchan_id = tx_chn->common.ep_config->mapped_channel_id; + else + tx_chn->udma_tchan_id = -1; + /* request and cfg UDMAP TX channel */ - tx_chn->udma_tchanx = xudma_tchan_get(tx_chn->common.udmax, -1); + tx_chn->udma_tchanx = xudma_tchan_get(tx_chn->common.udmax, + tx_chn->udma_tchan_id); if (IS_ERR(tx_chn->udma_tchanx)) { ret = PTR_ERR(tx_chn->udma_tchanx); dev_err(dev, "UDMAX tchanx get err %d\n", ret); @@ -264,11 +292,34 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev, } tx_chn->udma_tchan_id = xudma_tchan_get_id(tx_chn->udma_tchanx); + tx_chn->common.chan_dev.class = &k3_udma_glue_devclass; + tx_chn->common.chan_dev.parent = xudma_get_device(tx_chn->common.udmax); + dev_set_name(&tx_chn->common.chan_dev, "tchan%d-0x%04x", + tx_chn->udma_tchan_id, tx_chn->common.dst_thread); + ret = device_register(&tx_chn->common.chan_dev); + if (ret) { + dev_err(dev, "Channel Device registration failed %d\n", ret); + tx_chn->common.chan_dev.parent = NULL; + goto err; + } + + if (xudma_is_pktdma(tx_chn->common.udmax)) { + /* prepare the channel device as coherent */ + tx_chn->common.chan_dev.dma_coherent = true; + dma_coerce_mask_and_coherent(&tx_chn->common.chan_dev, + DMA_BIT_MASK(48)); + } + atomic_set(&tx_chn->free_pkts, cfg->txcq_cfg.size); + if (xudma_is_pktdma(tx_chn->common.udmax)) + tx_chn->udma_tflow_id = tx_chn->common.ep_config->default_flow_id; + else + tx_chn->udma_tflow_id = tx_chn->udma_tchan_id; + /* request and cfg rings */ ret = k3_ringacc_request_rings_pair(tx_chn->common.ringacc, - tx_chn->udma_tchan_id, -1, + tx_chn->udma_tflow_id, -1, &tx_chn->ringtx, &tx_chn->ringtxcq); if (ret) { @@ -280,6 +331,12 @@ struct k3_udma_glue_tx_channel *k3_udma_glue_request_tx_chn(struct device *dev, cfg->tx_cfg.dma_dev = 
k3_udma_glue_tx_get_dma_device(tx_chn); cfg->txcq_cfg.dma_dev = cfg->tx_cfg.dma_dev; + /* Set the ASEL value for DMA rings of PKTDMA */ + if (xudma_is_pktdma(tx_chn->common.udmax)) { + cfg->tx_cfg.asel = tx_chn->common.atype_asel; + cfg->txcq_cfg.asel = tx_chn->common.atype_asel; + } + ret = k3_ringacc_ring_cfg(tx_chn->ringtx, &cfg->tx_cfg); if (ret) { dev_err(dev, "Failed to cfg ringtx %d\n", ret); @@ -331,6 +388,11 @@ void k3_udma_glue_release_tx_chn(struct k3_udma_glue_tx_channel *tx_chn) if (tx_chn->ringtx) k3_ringacc_ring_free(tx_chn->ringtx); + + if (tx_chn->common.chan_dev.parent) { + device_unregister(&tx_chn->common.chan_dev); + tx_chn->common.chan_dev.parent = NULL; + } } EXPORT_SYMBOL_GPL(k3_udma_glue_release_tx_chn); @@ -443,13 +505,10 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn, void *data, void (*cleanup)(void *data, dma_addr_t desc_dma)) { + struct device *dev = tx_chn->common.dev; dma_addr_t desc_dma; int occ_tx, i, ret; - /* reset TXCQ as it is not input for udma - expected to be empty */ - if (tx_chn->ringtxcq) - k3_ringacc_ring_reset(tx_chn->ringtxcq); - /* * TXQ reset need to be special way as it is input for udma and its * state cached by udma, so: @@ -458,17 +517,20 @@ void k3_udma_glue_reset_tx_chn(struct k3_udma_glue_tx_channel *tx_chn, * 3) reset TXQ in a special way */ occ_tx = k3_ringacc_ring_get_occ(tx_chn->ringtx); - dev_dbg(tx_chn->common.dev, "TX reset occ_tx %u\n", occ_tx); + dev_dbg(dev, "TX reset occ_tx %u\n", occ_tx); for (i = 0; i < occ_tx; i++) { ret = k3_ringacc_ring_pop(tx_chn->ringtx, &desc_dma); if (ret) { - dev_err(tx_chn->common.dev, "TX reset pop %d\n", ret); + if (ret != -ENODATA) + dev_err(dev, "TX reset pop %d\n", ret); break; } cleanup(data, desc_dma); } + /* reset TXCQ as it is not input for udma - expected to be empty */ + k3_ringacc_ring_reset(tx_chn->ringtxcq); k3_ringacc_ring_reset_dma(tx_chn->ringtx, occ_tx); } EXPORT_SYMBOL_GPL(k3_udma_glue_reset_tx_chn); @@ -487,7 +549,12 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_txcq_id); int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn) { - tx_chn->virq = k3_ringacc_get_ring_irq_num(tx_chn->ringtxcq); + if (xudma_is_pktdma(tx_chn->common.udmax)) { + tx_chn->virq = xudma_pktdma_tflow_get_irq(tx_chn->common.udmax, + tx_chn->udma_tflow_id); + } else { + tx_chn->virq = k3_ringacc_get_ring_irq_num(tx_chn->ringtxcq); + } return tx_chn->virq; } @@ -496,10 +563,36 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_irq); struct device * k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn) { + if (xudma_is_pktdma(tx_chn->common.udmax) && + (tx_chn->common.atype_asel == 14 || tx_chn->common.atype_asel == 15)) + return &tx_chn->common.chan_dev; + return xudma_get_device(tx_chn->common.udmax); } EXPORT_SYMBOL_GPL(k3_udma_glue_tx_get_dma_device); +void k3_udma_glue_tx_dma_to_cppi5_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr) +{ + if (!xudma_is_pktdma(tx_chn->common.udmax) || + !tx_chn->common.atype_asel) + return; + + *addr |= (u64)tx_chn->common.atype_asel << K3_ADDRESS_ASEL_SHIFT; +} +EXPORT_SYMBOL_GPL(k3_udma_glue_tx_dma_to_cppi5_addr); + +void k3_udma_glue_tx_cppi5_to_dma_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr) +{ + if (!xudma_is_pktdma(tx_chn->common.udmax) || + !tx_chn->common.atype_asel) + return; + + *addr &= (u64)GENMASK(K3_ADDRESS_ASEL_SHIFT - 1, 0); +} +EXPORT_SYMBOL_GPL(k3_udma_glue_tx_cppi5_to_dma_addr); + static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) { const struct 
udma_tisci_rm *tisci_rm = rx_chn->common.tisci_rm; @@ -511,8 +604,6 @@ static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) req.valid_params = TI_SCI_MSG_VALUE_RM_UDMAP_CH_FETCH_SIZE_VALID | TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID | TI_SCI_MSG_VALUE_RM_UDMAP_CH_CHAN_TYPE_VALID | - TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID | - TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID | TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID; req.nav_id = tisci_rm->tisci_dev_id; @@ -524,13 +615,16 @@ static int k3_udma_glue_cfg_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) * req.rxcq_qnum = k3_ringacc_get_ring_id(rx_chn->flows[0].ringrx); */ req.rxcq_qnum = 0xFFFF; - if (rx_chn->flow_num && rx_chn->flow_id_base != rx_chn->udma_rchan_id) { + if (!xudma_is_pktdma(rx_chn->common.udmax) && rx_chn->flow_num && + rx_chn->flow_id_base != rx_chn->udma_rchan_id) { /* Default flow + extra ones */ + req.valid_params |= TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID | + TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID; req.flowid_start = rx_chn->flow_id_base; req.flowid_cnt = rx_chn->flow_num; } req.rx_chan_type = TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR; - req.rx_atype = rx_chn->common.atype; + req.rx_atype = rx_chn->common.atype_asel; ret = tisci_rm->tisci_udmap_ops->rx_ch_cfg(tisci_rm->tisci, &req); if (ret) @@ -584,10 +678,18 @@ static int k3_udma_glue_cfg_rx_flow(struct k3_udma_glue_rx_channel *rx_chn, goto err_rflow_put; } + if (xudma_is_pktdma(rx_chn->common.udmax)) { + rx_ringfdq_id = flow->udma_rflow_id + + xudma_get_rflow_ring_offset(rx_chn->common.udmax); + rx_ring_id = 0; + } else { + rx_ring_id = flow_cfg->ring_rxq_id; + rx_ringfdq_id = flow_cfg->ring_rxfdq0_id; + } + /* request and cfg rings */ ret = k3_ringacc_request_rings_pair(rx_chn->common.ringacc, - flow_cfg->ring_rxfdq0_id, - flow_cfg->ring_rxq_id, + rx_ringfdq_id, rx_ring_id, &flow->ringrxfdq, &flow->ringrx); if (ret) { @@ -599,6 +701,12 @@ static int k3_udma_glue_cfg_rx_flow(struct k3_udma_glue_rx_channel *rx_chn, flow_cfg->rx_cfg.dma_dev = k3_udma_glue_rx_get_dma_device(rx_chn); flow_cfg->rxfdq_cfg.dma_dev = flow_cfg->rx_cfg.dma_dev; + /* Set the ASEL value for DMA rings of PKTDMA */ + if (xudma_is_pktdma(rx_chn->common.udmax)) { + flow_cfg->rx_cfg.asel = rx_chn->common.atype_asel; + flow_cfg->rxfdq_cfg.asel = rx_chn->common.atype_asel; + } + ret = k3_ringacc_ring_cfg(flow->ringrx, &flow_cfg->rx_cfg); if (ret) { dev_err(dev, "Failed to cfg ringrx %d\n", ret); @@ -757,6 +865,7 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name, struct k3_udma_glue_rx_channel_cfg *cfg) { struct k3_udma_glue_rx_channel *rx_chn; + struct psil_endpoint_config *ep_cfg; int ret, i; if (cfg->flow_id_num <= 0) @@ -784,8 +893,16 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name, rx_chn->common.psdata_size, rx_chn->common.swdata_size); + ep_cfg = rx_chn->common.ep_config; + + if (xudma_is_pktdma(rx_chn->common.udmax)) + rx_chn->udma_rchan_id = ep_cfg->mapped_channel_id; + else + rx_chn->udma_rchan_id = -1; + /* request and cfg UDMAP RX channel */ - rx_chn->udma_rchanx = xudma_rchan_get(rx_chn->common.udmax, -1); + rx_chn->udma_rchanx = xudma_rchan_get(rx_chn->common.udmax, + rx_chn->udma_rchan_id); if (IS_ERR(rx_chn->udma_rchanx)) { ret = PTR_ERR(rx_chn->udma_rchanx); dev_err(dev, "UDMAX rchanx get err %d\n", ret); @@ -793,12 +910,48 @@ k3_udma_glue_request_rx_chn_priv(struct device *dev, const char *name, } rx_chn->udma_rchan_id = xudma_rchan_get_id(rx_chn->udma_rchanx); - rx_chn->flow_num = 
cfg->flow_id_num; - rx_chn->flow_id_base = cfg->flow_id_base; + rx_chn->common.chan_dev.class = &k3_udma_glue_devclass; + rx_chn->common.chan_dev.parent = xudma_get_device(rx_chn->common.udmax); + dev_set_name(&rx_chn->common.chan_dev, "rchan%d-0x%04x", + rx_chn->udma_rchan_id, rx_chn->common.src_thread); + ret = device_register(&rx_chn->common.chan_dev); + if (ret) { + dev_err(dev, "Channel Device registration failed %d\n", ret); + rx_chn->common.chan_dev.parent = NULL; + goto err; + } - /* Use RX channel id as flow id: target dev can't generate flow_id */ - if (cfg->flow_id_use_rxchan_id) - rx_chn->flow_id_base = rx_chn->udma_rchan_id; + if (xudma_is_pktdma(rx_chn->common.udmax)) { + /* prepare the channel device as coherent */ + rx_chn->common.chan_dev.dma_coherent = true; + dma_coerce_mask_and_coherent(&rx_chn->common.chan_dev, + DMA_BIT_MASK(48)); + } + + if (xudma_is_pktdma(rx_chn->common.udmax)) { + int flow_start = cfg->flow_id_base; + int flow_end; + + if (flow_start == -1) + flow_start = ep_cfg->flow_start; + + flow_end = flow_start + cfg->flow_id_num - 1; + if (flow_start < ep_cfg->flow_start || + flow_end > (ep_cfg->flow_start + ep_cfg->flow_num - 1)) { + dev_err(dev, "Invalid flow range requested\n"); + ret = -EINVAL; + goto err; + } + rx_chn->flow_id_base = flow_start; + } else { + rx_chn->flow_id_base = cfg->flow_id_base; + + /* Use RX channel id as flow id: target dev can't generate flow_id */ + if (cfg->flow_id_use_rxchan_id) + rx_chn->flow_id_base = rx_chn->udma_rchan_id; + } + + rx_chn->flow_num = cfg->flow_id_num; rx_chn->flows = devm_kcalloc(dev, rx_chn->flow_num, sizeof(*rx_chn->flows), GFP_KERNEL); @@ -888,6 +1041,24 @@ k3_udma_glue_request_remote_rx_chn(struct device *dev, const char *name, goto err; } + rx_chn->common.chan_dev.class = &k3_udma_glue_devclass; + rx_chn->common.chan_dev.parent = xudma_get_device(rx_chn->common.udmax); + dev_set_name(&rx_chn->common.chan_dev, "rchan_remote-0x%04x", + rx_chn->common.src_thread); + ret = device_register(&rx_chn->common.chan_dev); + if (ret) { + dev_err(dev, "Channel Device registration failed %d\n", ret); + rx_chn->common.chan_dev.parent = NULL; + goto err; + } + + if (xudma_is_pktdma(rx_chn->common.udmax)) { + /* prepare the channel device as coherent */ + rx_chn->common.chan_dev.dma_coherent = true; + dma_coerce_mask_and_coherent(&rx_chn->common.chan_dev, + DMA_BIT_MASK(48)); + } + ret = k3_udma_glue_allocate_rx_flows(rx_chn, cfg); if (ret) goto err; @@ -940,6 +1111,11 @@ void k3_udma_glue_release_rx_chn(struct k3_udma_glue_rx_channel *rx_chn) if (!IS_ERR_OR_NULL(rx_chn->udma_rchanx)) xudma_rchan_put(rx_chn->common.udmax, rx_chn->udma_rchanx); + + if (rx_chn->common.chan_dev.parent) { + device_unregister(&rx_chn->common.chan_dev); + rx_chn->common.chan_dev.parent = NULL; + } } EXPORT_SYMBOL_GPL(k3_udma_glue_release_rx_chn); @@ -1151,12 +1327,10 @@ void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn, /* reset RXCQ as it is not input for udma - expected to be empty */ occ_rx = k3_ringacc_ring_get_occ(flow->ringrx); dev_dbg(dev, "RX reset flow %u occ_rx %u\n", flow_num, occ_rx); - if (flow->ringrx) - k3_ringacc_ring_reset(flow->ringrx); /* Skip RX FDQ in case one FDQ is used for the set of flows */ if (skip_fdq) - return; + goto do_reset; /* * RX FDQ reset need to be special way as it is input for udma and its @@ -1171,13 +1345,17 @@ void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn, for (i = 0; i < occ_rx; i++) { ret = k3_ringacc_ring_pop(flow->ringrxfdq, &desc_dma); if (ret) { - 
dev_err(dev, "RX reset pop %d\n", ret); + if (ret != -ENODATA) + dev_err(dev, "RX reset pop %d\n", ret); break; } cleanup(data, desc_dma); } k3_ringacc_ring_reset_dma(flow->ringrxfdq, occ_rx); + +do_reset: + k3_ringacc_ring_reset(flow->ringrx); } EXPORT_SYMBOL_GPL(k3_udma_glue_reset_rx_chn); @@ -1207,7 +1385,12 @@ int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn, flow = &rx_chn->flows[flow_num]; - flow->virq = k3_ringacc_get_ring_irq_num(flow->ringrx); + if (xudma_is_pktdma(rx_chn->common.udmax)) { + flow->virq = xudma_pktdma_rflow_get_irq(rx_chn->common.udmax, + flow->udma_rflow_id); + } else { + flow->virq = k3_ringacc_get_ring_irq_num(flow->ringrx); + } return flow->virq; } @@ -1216,6 +1399,38 @@ EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_irq); struct device * k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn) { + if (xudma_is_pktdma(rx_chn->common.udmax) && + (rx_chn->common.atype_asel == 14 || rx_chn->common.atype_asel == 15)) + return &rx_chn->common.chan_dev; + return xudma_get_device(rx_chn->common.udmax); } EXPORT_SYMBOL_GPL(k3_udma_glue_rx_get_dma_device); + +void k3_udma_glue_rx_dma_to_cppi5_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr) +{ + if (!xudma_is_pktdma(rx_chn->common.udmax) || + !rx_chn->common.atype_asel) + return; + + *addr |= (u64)rx_chn->common.atype_asel << K3_ADDRESS_ASEL_SHIFT; +} +EXPORT_SYMBOL_GPL(k3_udma_glue_rx_dma_to_cppi5_addr); + +void k3_udma_glue_rx_cppi5_to_dma_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr) +{ + if (!xudma_is_pktdma(rx_chn->common.udmax) || + !rx_chn->common.atype_asel) + return; + + *addr &= (u64)GENMASK(K3_ADDRESS_ASEL_SHIFT - 1, 0); +} +EXPORT_SYMBOL_GPL(k3_udma_glue_rx_cppi5_to_dma_addr); + +static int __init k3_udma_glue_class_init(void) +{ + return class_register(&k3_udma_glue_devclass); +} +arch_initcall(k3_udma_glue_class_init); diff --git a/drivers/dma/ti/k3-udma-private.c b/drivers/dma/ti/k3-udma-private.c index 5436b19d656e..eb4795c089bd 100644 --- a/drivers/dma/ti/k3-udma-private.c +++ b/drivers/dma/ti/k3-udma-private.c @@ -157,3 +157,27 @@ void xudma_##res##rt_write(struct udma_##res *p, int reg, u32 val) \ EXPORT_SYMBOL(xudma_##res##rt_write) XUDMA_RT_IO_FUNCTIONS(tchan); XUDMA_RT_IO_FUNCTIONS(rchan); + +int xudma_is_pktdma(struct udma_dev *ud) +{ + return ud->match_data->type == DMA_TYPE_PKTDMA; +} +EXPORT_SYMBOL(xudma_is_pktdma); + +int xudma_pktdma_tflow_get_irq(struct udma_dev *ud, int udma_tflow_id) +{ + const struct udma_oes_offsets *oes = &ud->soc_data->oes; + + return ti_sci_inta_msi_get_virq(ud->dev, udma_tflow_id + + oes->pktdma_tchan_flow); +} +EXPORT_SYMBOL(xudma_pktdma_tflow_get_irq); + +int xudma_pktdma_rflow_get_irq(struct udma_dev *ud, int udma_rflow_id) +{ + const struct udma_oes_offsets *oes = &ud->soc_data->oes; + + return ti_sci_inta_msi_get_virq(ud->dev, udma_rflow_id + + oes->pktdma_rchan_flow); +} +EXPORT_SYMBOL(xudma_pktdma_rflow_get_irq); diff --git a/drivers/dma/ti/k3-udma.h b/drivers/dma/ti/k3-udma.h index ccb19f286daf..d349c6d482ae 100644 --- a/drivers/dma/ti/k3-udma.h +++ b/drivers/dma/ti/k3-udma.h @@ -157,4 +157,8 @@ void xudma_rchanrt_write(struct udma_rchan *rchan, int reg, u32 val); bool xudma_rflow_is_gp(struct udma_dev *ud, int id); int xudma_get_rflow_ring_offset(struct udma_dev *ud); +int xudma_is_pktdma(struct udma_dev *ud); + +int xudma_pktdma_tflow_get_irq(struct udma_dev *ud, int udma_tflow_id); +int xudma_pktdma_rflow_get_irq(struct udma_dev *ud, int udma_rflow_id); #endif /* K3_UDMA_H_ */ diff --git 
a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index d7c12f31377c..e443be4d3b4b 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -43,6 +43,10 @@ u32 k3_udma_glue_tx_get_txcq_id(struct k3_udma_glue_tx_channel *tx_chn); int k3_udma_glue_tx_get_irq(struct k3_udma_glue_tx_channel *tx_chn); struct device * k3_udma_glue_tx_get_dma_device(struct k3_udma_glue_tx_channel *tx_chn); +void k3_udma_glue_tx_dma_to_cppi5_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr); +void k3_udma_glue_tx_cppi5_to_dma_addr(struct k3_udma_glue_tx_channel *tx_chn, + dma_addr_t *addr); enum { K3_UDMA_GLUE_SRC_TAG_LO_KEEP = 0, @@ -134,5 +138,9 @@ int k3_udma_glue_rx_flow_disable(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_idx); struct device * k3_udma_glue_rx_get_dma_device(struct k3_udma_glue_rx_channel *rx_chn); +void k3_udma_glue_rx_dma_to_cppi5_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr); +void k3_udma_glue_rx_cppi5_to_dma_addr(struct k3_udma_glue_rx_channel *rx_chn, + dma_addr_t *addr); #endif /* K3_UDMA_GLUE_H_ */ -- cgit v1.2.3 From 9a20f6f4e6ba9713605fbf7e7426ca22f1181545 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 4 Dec 2020 15:16:46 -0500 Subject: SUNRPC: Fixes for xdr_align_data() The main use case right now for xdr_align_data() is to shift the page data to the left, and in practice shrink the total XDR data buffer. This patch ensures that we fix up the accounting for the buffer length as we shift that data around. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 2 +- net/sunrpc/xdr.c | 174 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 133 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 9548d075e06d..2b4e44bb0654 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -252,7 +252,7 @@ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); -extern uint64_t xdr_align_data(struct xdr_stream *, uint64_t, uint32_t); +extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); /** diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 5833329c132c..c474339ba9ac 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -266,26 +266,6 @@ _shift_data_left_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static void -_shift_data_left_tail(struct xdr_buf *buf, unsigned int pgto, size_t len) -{ - struct kvec *tail = buf->tail; - - if (len > tail->iov_len) - len = tail->iov_len; - - _copy_to_pages(buf->pages, - buf->page_base + pgto, - (char *)tail->iov_base, - len); - tail->iov_len -= len; - - if (tail->iov_len > 0) - memmove((char *)tail->iov_base, - tail->iov_base + len, - tail->iov_len); -} - /** * _shift_data_right_pages * @pages: vector of pages containing both the source and dest memory area. 
@@ -516,6 +496,109 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len) } while ((len -= zero) != 0); } +static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, + unsigned int len, unsigned int shift) +{ + const struct kvec *tail = buf->tail; + + if (base >= tail->iov_len) + return; + if (len > tail->iov_len - base) + len = tail->iov_len - base; + /* Shift data into head */ + if (shift > buf->page_len + base) { + const struct kvec *head = buf->head; + unsigned int hdto = + head->iov_len + buf->page_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + buf->page_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + memcpy(head->iov_base + hdto, tail->iov_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + /* Shift data into pages */ + if (shift > base) { + unsigned int pgto = buf->page_len + base - shift; + unsigned int pglen = len; + + if (pgto + pglen > buf->page_len) + pglen = buf->page_len - pgto; + _copy_to_pages(buf->pages, buf->page_base + pgto, + tail->iov_base + base, pglen); + base += pglen; + len -= pglen; + if (!len) + return; + } + memmove(tail->iov_base + base - shift, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + unsigned int pgto; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + /* Shift data into head */ + if (shift > base) { + const struct kvec *head = buf->head; + unsigned int hdto = head->iov_len + base - shift; + unsigned int hdlen = len; + + if (WARN_ONCE(shift > head->iov_len + base, + "SUNRPC: Misaligned data.\n")) + return; + if (hdto + hdlen > head->iov_len) + hdlen = head->iov_len - hdto; + _copy_from_pages(head->iov_base + hdto, buf->pages, + buf->page_base + base, hdlen); + base += hdlen; + len -= hdlen; + if (!len) + return; + } + pgto = base - shift; + _shift_data_left_pages(buf->pages, buf->page_base + pgto, + buf->page_base + base, len); +} + +static void xdr_buf_tail_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + xdr_buf_tail_copy_left(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_left(buf, base - buf->page_len, len, shift); + return; + } + xdr_buf_pages_copy_left(buf, base, len, shift); + len += base; + if (len <= buf->page_len) + return; + xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift); +} + /** * xdr_shrink_bufhead * @buf: xdr_buf @@ -1261,38 +1344,45 @@ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) } EXPORT_SYMBOL_GPL(xdr_read_pages); -uint64_t xdr_align_data(struct xdr_stream *xdr, uint64_t offset, uint32_t length) +unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int from, bytes; - unsigned int shift = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, bytes, len; + unsigned int shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr_stream_remaining(xdr); - if (length < bytes) - bytes = length; + + if (from 
>= buf->page_len + buf->tail->iov_len) + return 0; + if (from + buf->head->iov_len >= buf->len) + return 0; + + len = buf->len - buf->head->iov_len; + + /* We only shift data left! */ + if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n", + from, offset)) + return 0; + if (WARN_ONCE(offset > buf->page_len, + "SUNRPC: buffer overflow. offset=%u, page_len=%u\n", + offset, buf->page_len)) + return 0; /* Move page data to the left */ - if (from > offset) { - shift = min_t(unsigned int, bytes, buf->page_len - from); - _shift_data_left_pages(buf->pages, - buf->page_base + offset, - buf->page_base + from, - shift); - bytes -= shift; + shift = from - offset; + xdr_buf_pages_shift_left(buf, from, len, shift); + xdr->buf->len -= shift; + xdr->nwords -= XDR_QUADLEN(shift); - /* Move tail data into the pages, if necessary */ - if (bytes > 0) - _shift_data_left_tail(buf, offset + shift, bytes); - } + bytes = xdr_stream_remaining(xdr); + if (length > bytes) + length = bytes; + bytes -= length; xdr->nwords -= XDR_QUADLEN(length); - xdr_set_page(xdr, from + length, xdr_stream_remaining(xdr)); + xdr_set_page(xdr, offset + length, bytes); return length; } EXPORT_SYMBOL_GPL(xdr_align_data); -- cgit v1.2.3 From c4f2f591f02c392ea7de018d2733748bf4c7b5f5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 4 Dec 2020 17:15:09 -0500 Subject: SUNRPC: Fix xdr_expand_hole() We do want to try to grow the buffer if possible, but if that attempt fails, we still want to move the data and truncate the XDR message. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 2 +- net/sunrpc/xdr.c | 274 +++++++++++++++++++++++++++++---------------- 2 files changed, 180 insertions(+), 96 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 2b4e44bb0654..178f499e2283 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -253,7 +253,7 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); -extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t); +extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length); /** * xdr_stream_remaining - Return the number of bytes remaining in the stream diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index c474339ba9ac..e0906ed24374 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -334,46 +334,6 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base, } while ((len -= copy) != 0); } -static unsigned int -_shift_data_right_tail(struct xdr_buf *buf, unsigned int pgfrom, size_t len) -{ - struct kvec *tail = buf->tail; - unsigned int tailbuf_len; - unsigned int result = 0; - size_t copy; - - tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len; - - /* Shift the tail first */ - if (tailbuf_len != 0) { - unsigned int free_space = tailbuf_len - tail->iov_len; - - if (len < free_space) - free_space = len; - if (len > free_space) - len = free_space; - - tail->iov_len += free_space; - copy = len; - - if (tail->iov_len > len) { - char *p = (char *)tail->iov_base + len; - memmove(p, tail->iov_base, tail->iov_len - free_space); - result += tail->iov_len - free_space; - } else - copy 
= tail->iov_len; - - /* Copy from the inlined pages into the tail */ - _copy_from_pages((char *)tail->iov_base, - buf->pages, - buf->page_base + pgfrom, - copy); - result += copy; - } - - return result; -} - /** * _copy_to_pages * @pages: array of pages @@ -464,18 +424,42 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) } EXPORT_SYMBOL_GPL(_copy_from_pages); +static void xdr_buf_iov_zero(const struct kvec *iov, unsigned int base, + unsigned int len) +{ + if (base >= iov->iov_len) + return; + if (len > iov->iov_len - base) + len = iov->iov_len - base; + memset(iov->iov_base + base, 0, len); +} + /** - * _zero_pages - * @pages: array of pages - * @pgbase: beginning page vector address + * xdr_buf_pages_zero + * @buf: xdr_buf + * @pgbase: beginning offset * @len: length */ -static void -_zero_pages(struct page **pages, size_t pgbase, size_t len) +static void xdr_buf_pages_zero(const struct xdr_buf *buf, unsigned int pgbase, + unsigned int len) { + struct page **pages = buf->pages; struct page **page; char *vpage; - size_t zero; + unsigned int zero; + + if (!len) + return; + if (pgbase >= buf->page_len) { + xdr_buf_iov_zero(buf->tail, pgbase - buf->page_len, len); + return; + } + if (pgbase + len > buf->page_len) { + xdr_buf_iov_zero(buf->tail, 0, pgbase + len - buf->page_len); + len = buf->page_len - pgbase; + } + + pgbase += buf->page_base; page = pages + (pgbase >> PAGE_SHIFT); pgbase &= ~PAGE_MASK; @@ -496,6 +480,103 @@ _zero_pages(struct page **pages, size_t pgbase, size_t len) } while ((len -= zero) != 0); } +static void xdr_buf_try_expand(struct xdr_buf *buf, unsigned int len) +{ + struct kvec *head = buf->head; + struct kvec *tail = buf->tail; + unsigned int sum = head->iov_len + buf->page_len + tail->iov_len; + unsigned int free_space; + + if (sum > buf->len) { + free_space = min_t(unsigned int, sum - buf->len, len); + buf->len += free_space; + len -= free_space; + if (!len) + return; + } + + if (buf->buflen > sum) { + /* Expand the tail buffer */ + free_space = min_t(unsigned int, buf->buflen - sum, len); + tail->iov_len += free_space; + buf->len += free_space; + } +} + +static void xdr_buf_tail_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + + if (to >= tail->iov_len) + return; + if (len + to > tail->iov_len) + len = tail->iov_len - to; + memmove(tail->iov_base + to, tail->iov_base + base, len); +} + +static void xdr_buf_pages_copy_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *tail = buf->tail; + unsigned int to = base + shift; + unsigned int pglen = 0; + unsigned int talen = 0, tato = 0; + + if (base >= buf->page_len) + return; + if (len > buf->page_len - base) + len = buf->page_len - base; + if (to >= buf->page_len) { + tato = to - buf->page_len; + if (tail->iov_len >= len + tato) + talen = len; + else if (tail->iov_len > tato) + talen = tail->iov_len - tato; + } else if (len + to >= buf->page_len) { + pglen = buf->page_len - to; + talen = len - pglen; + if (talen > tail->iov_len) + talen = tail->iov_len; + } else + pglen = len; + + _copy_from_pages(tail->iov_base + tato, buf->pages, + buf->page_base + base + pglen, talen); + _shift_data_right_pages(buf->pages, buf->page_base + to, + buf->page_base + base, pglen); +} + +static void xdr_buf_tail_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const 
struct kvec *tail = buf->tail; + + if (base >= tail->iov_len || !shift || !len) + return; + xdr_buf_tail_copy_right(buf, base, len, shift); +} + +static void xdr_buf_pages_shift_right(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + if (!shift || !len) + return; + if (base >= buf->page_len) { + xdr_buf_tail_shift_right(buf, base - buf->page_len, len, shift); + return; + } + if (base + len > buf->page_len) + xdr_buf_tail_shift_right(buf, 0, base + len - buf->page_len, + shift); + xdr_buf_pages_copy_right(buf, base, len, shift); +} + static void xdr_buf_tail_copy_left(const struct xdr_buf *buf, unsigned int base, unsigned int len, unsigned int shift) { @@ -685,30 +766,33 @@ xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) } /** - * xdr_shrink_pagelen - shrinks buf->pages by up to @len bytes + * xdr_shrink_pagelen - shrinks buf->pages to @len bytes * @buf: xdr_buf - * @len: bytes to remove from buf->pages + * @len: new page buffer length * * The extra data is not lost, but is instead moved into buf->tail. * Returns the actual number of bytes moved. */ -static unsigned int -xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +static unsigned int xdr_shrink_pagelen(struct xdr_buf *buf, unsigned int len) { - unsigned int pglen = buf->page_len; - unsigned int result; - - if (len > buf->page_len) - len = buf-> page_len; - - result = _shift_data_right_tail(buf, pglen - len, len); - buf->page_len -= len; - buf->buflen -= len; - /* Have we truncated the message? */ - if (buf->len > buf->buflen) - buf->len = buf->buflen; + unsigned int shift, buflen = buf->len - buf->head->iov_len; - return result; + WARN_ON_ONCE(len > buf->page_len); + if (buf->head->iov_len >= buf->len || len > buflen) + buflen = len; + if (buf->page_len > buflen) { + buf->buflen -= buf->page_len - buflen; + buf->page_len = buflen; + } + if (len >= buf->page_len) + return 0; + shift = buf->page_len - len; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, len, buflen - len, shift); + buf->page_len = len; + buf->len -= shift; + buf->buflen -= shift; + return shift; } void @@ -728,6 +812,18 @@ unsigned int xdr_stream_pos(const struct xdr_stream *xdr) } EXPORT_SYMBOL_GPL(xdr_stream_pos); +static void xdr_stream_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + unsigned int blen = xdr->buf->len; + + xdr->nwords = blen > pos ? 
XDR_QUADLEN(blen) - XDR_QUADLEN(pos) : 0; +} + +static void xdr_stream_page_set_pos(struct xdr_stream *xdr, unsigned int pos) +{ + xdr_stream_set_pos(xdr, pos + xdr->buf->head[0].iov_len); +} + /** * xdr_page_pos - Return the current offset from the start of the xdr pages * @xdr: pointer to struct xdr_stream @@ -1291,7 +1387,7 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) struct xdr_buf *buf = xdr->buf; unsigned int nwords = XDR_QUADLEN(len); unsigned int cur = xdr_stream_pos(xdr); - unsigned int copied, offset; + unsigned int copied; if (xdr->nwords == 0) return 0; @@ -1305,9 +1401,8 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) len = buf->page_len; else if (nwords < xdr->nwords) { /* Truncate page data and move it into the tail */ - offset = buf->page_len - len; - copied = xdr_shrink_pagelen(buf, offset); - trace_rpc_xdr_alignment(xdr, offset, copied); + copied = xdr_shrink_pagelen(buf, len); + trace_rpc_xdr_alignment(xdr, len, copied); xdr->nwords = XDR_QUADLEN(buf->len - cur); } return len; @@ -1387,39 +1482,28 @@ unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, } EXPORT_SYMBOL_GPL(xdr_align_data); -uint64_t xdr_expand_hole(struct xdr_stream *xdr, uint64_t offset, uint64_t length) +unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) { struct xdr_buf *buf = xdr->buf; - unsigned int bytes; - unsigned int from; - unsigned int truncated = 0; - - if ((offset + length) < offset || - (offset + length) > buf->page_len) - length = buf->page_len - offset; + unsigned int from, to, shift; xdr_realign_pages(xdr); from = xdr_page_pos(xdr); - bytes = xdr_stream_remaining(xdr); - - if (offset + length + bytes > buf->page_len) { - unsigned int shift = (offset + length + bytes) - buf->page_len; - unsigned int res = _shift_data_right_tail(buf, from + bytes - shift, shift); - truncated = shift - res; - xdr->nwords -= XDR_QUADLEN(truncated); - bytes -= shift; - } - - /* Now move the page data over and zero pages */ - if (bytes > 0) - _shift_data_right_pages(buf->pages, - buf->page_base + offset + length, - buf->page_base + from, - bytes); - _zero_pages(buf->pages, buf->page_base + offset, length); - - buf->len += length - (from - offset) - truncated; - xdr_set_page(xdr, offset + length, xdr_stream_remaining(xdr)); + to = xdr_align_size(offset + length); + + /* Could the hole be behind us? */ + if (to > from) { + unsigned int buflen = buf->len - buf->head->iov_len; + shift = to - from; + xdr_buf_try_expand(buf, shift); + xdr_buf_pages_shift_right(buf, from, buflen, shift); + xdr_stream_page_set_pos(xdr, to); + } else if (to != from) + xdr_align_data(xdr, to, 0); + xdr_buf_pages_zero(buf, offset, length); + + xdr_set_page(xdr, to, xdr_stream_remaining(xdr)); return length; } EXPORT_SYMBOL_GPL(xdr_expand_hole); -- cgit v1.2.3 From f8d0e60f1056687826abc1eded98f0ea067dfc4c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Dec 2020 22:56:18 -0500 Subject: SUNRPC: Cleanup - constify a number of xdr_buf helpers There are a number of xdr helpers for struct xdr_buf that do not change the structure itself. Mark those as taking const pointers for documentation purposes. 
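To illustrate what the constification buys callers, here is a minimal sketch (peek_word() is a hypothetical caller, not part of this patch):

    /* A read-only accessor can now take a const buffer, documenting
     * at the type level that the buffer is never modified. */
    static u32 peek_word(const struct xdr_buf *buf, unsigned int base)
    {
    	u32 word;

    	/* xdr_decode_word() accepts a const xdr_buf after this patch */
    	if (xdr_decode_word(buf, base, &word) < 0)
    		return 0;
    	return word;
    }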
Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 22 +++++++++---------- net/sunrpc/xdr.c | 53 +++++++++++++++++++++------------------------- 2 files changed, 35 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 178f499e2283..68d49fdc4ee9 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -128,8 +128,8 @@ __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); void xdr_inline_pages(struct xdr_buf *, unsigned int, struct page **, unsigned int, unsigned int); -void xdr_terminate_string(struct xdr_buf *, const u32); -size_t xdr_buf_pagecount(struct xdr_buf *buf); +void xdr_terminate_string(const struct xdr_buf *, const u32); +size_t xdr_buf_pagecount(const struct xdr_buf *buf); int xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp); void xdr_free_bvec(struct xdr_buf *buf); @@ -182,14 +182,14 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p) * XDR buffer helper functions */ extern void xdr_shift_buf(struct xdr_buf *, size_t); -extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *); -extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); +extern void xdr_buf_from_iov(const struct kvec *, struct xdr_buf *); +extern int xdr_buf_subsegment(const struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int); extern void xdr_buf_trim(struct xdr_buf *, unsigned int); -extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); -extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int); +extern int read_bytes_from_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); +extern int write_bytes_to_xdr_buf(const struct xdr_buf *, unsigned int, void *, unsigned int); -extern int xdr_encode_word(struct xdr_buf *, unsigned int, u32); -extern int xdr_decode_word(struct xdr_buf *, unsigned int, u32 *); +extern int xdr_encode_word(const struct xdr_buf *, unsigned int, u32); +extern int xdr_decode_word(const struct xdr_buf *, unsigned int, u32 *); struct xdr_array2_desc; typedef int (*xdr_xcode_elem_t)(struct xdr_array2_desc *desc, void *elem); @@ -200,9 +200,9 @@ struct xdr_array2_desc { xdr_xcode_elem_t xcode; }; -extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base, +extern int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); -extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base, +extern int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, struct xdr_array2_desc *desc); extern void _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len); @@ -251,7 +251,7 @@ extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buf extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); -extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); +extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length); diff --git 
a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index f0444bf5617c..2e91fbd70f11 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -123,8 +123,7 @@ EXPORT_SYMBOL_GPL(xdr_decode_string_inplace); * @len: length of string, in bytes * */ -void -xdr_terminate_string(struct xdr_buf *buf, const u32 len) +void xdr_terminate_string(const struct xdr_buf *buf, const u32 len) { char *kaddr; @@ -134,8 +133,7 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); -size_t -xdr_buf_pagecount(struct xdr_buf *buf) +size_t xdr_buf_pagecount(const struct xdr_buf *buf) { if (!buf->page_len) return 0; @@ -1545,8 +1543,7 @@ EXPORT_SYMBOL_GPL(xdr_enter_page); static const struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; -void -xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) +void xdr_buf_from_iov(const struct kvec *iov, struct xdr_buf *buf) { buf->head[0] = *iov; buf->tail[0] = empty_iov; @@ -1569,9 +1566,8 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov); * * Returns -1 if base of length are out of bounds. */ -int -xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, - unsigned int base, unsigned int len) +int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, + unsigned int base, unsigned int len) { subbuf->buflen = subbuf->len = len; if (base < buf->head[0].iov_len) { @@ -1659,7 +1655,8 @@ fix_len: } EXPORT_SYMBOL_GPL(xdr_buf_trim); -static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __read_bytes_from_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1676,7 +1673,8 @@ static void __read_bytes_from_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigne } /* obj is assumed to point to allocated memory of size at least len: */ -int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int read_bytes_from_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1689,7 +1687,8 @@ int read_bytes_from_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, u } EXPORT_SYMBOL_GPL(read_bytes_from_xdr_buf); -static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned int len) +static void __write_bytes_to_xdr_buf(const struct xdr_buf *subbuf, + void *obj, unsigned int len) { unsigned int this_len; @@ -1706,7 +1705,8 @@ static void __write_bytes_to_xdr_buf(struct xdr_buf *subbuf, void *obj, unsigned } /* obj is assumed to point to allocated memory of size at least len: */ -int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, unsigned int len) +int write_bytes_to_xdr_buf(const struct xdr_buf *buf, unsigned int base, + void *obj, unsigned int len) { struct xdr_buf subbuf; int status; @@ -1719,8 +1719,7 @@ int write_bytes_to_xdr_buf(struct xdr_buf *buf, unsigned int base, void *obj, un } EXPORT_SYMBOL_GPL(write_bytes_to_xdr_buf); -int -xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) +int xdr_decode_word(const struct xdr_buf *buf, unsigned int base, u32 *obj) { __be32 raw; int status; @@ -1733,8 +1732,7 @@ xdr_decode_word(struct xdr_buf *buf, unsigned int base, u32 *obj) } EXPORT_SYMBOL_GPL(xdr_decode_word); -int -xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) +int xdr_encode_word(const struct xdr_buf *buf, unsigned int base, u32 obj) { __be32 raw = cpu_to_be32(obj); @@ -1743,9 +1741,8 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) 
EXPORT_SYMBOL_GPL(xdr_encode_word); /* Returns 0 on success, or else a negative error code. */ -static int -xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc, int encode) +static int xdr_xcode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc, int encode) { char *elem = NULL, *c; unsigned int copied = 0, todo, avail_here; @@ -1937,9 +1934,8 @@ out: return err; } -int -xdr_decode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_decode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if (base >= buf->len) return -EINVAL; @@ -1948,9 +1944,8 @@ xdr_decode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_decode_array2); -int -xdr_encode_array2(struct xdr_buf *buf, unsigned int base, - struct xdr_array2_desc *desc) +int xdr_encode_array2(const struct xdr_buf *buf, unsigned int base, + struct xdr_array2_desc *desc) { if ((unsigned long) base + 4 + desc->array_len * desc->elem_size > buf->head->iov_len + buf->page_len + buf->tail->iov_len) @@ -1960,9 +1955,9 @@ xdr_encode_array2(struct xdr_buf *buf, unsigned int base, } EXPORT_SYMBOL_GPL(xdr_encode_array2); -int -xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, - int (*actor)(struct scatterlist *, void *), void *data) +int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, + unsigned int len, + int (*actor)(struct scatterlist *, void *), void *data) { int i, ret = 0; unsigned int page_len, thislen, page_offset; -- cgit v1.2.3 From 7c03e2cda4a584cadc398e8f6641ca9988a39d52 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 14 Dec 2020 15:26:13 +0100 Subject: vfs: move cap_convert_nscap() call into vfs_setxattr() cap_convert_nscap() does permission checking as well as conversion of the xattr value conditionally based on fs's user-ns. This is needed by overlayfs and probably other layered fs (ecryptfs) and is what vfs_foo() is supposed to do anyway. 
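As a sketch of the upshot for a stacking filesystem (a hypothetical call site, not taken from this patch; upperdentry, value and size are assumed locals):

    /* A layered fs writing through to its upper dentry no longer needs
     * to special-case XATTR_NAME_CAPS: vfs_setxattr() now performs the
     * permission check and namespaced-cap conversion itself. */
    err = vfs_setxattr(upperdentry, XATTR_NAME_CAPS, value, size, 0);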
Signed-off-by: Miklos Szeredi Acked-by: James Morris --- fs/xattr.c | 17 +++++++++++------ include/linux/capability.h | 2 +- security/commoncap.c | 3 +-- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/xattr.c b/fs/xattr.c index cd7a563e8bcd..fd57153b1f61 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -276,8 +276,16 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; + const void *orig_value = value; int error; + if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { + error = cap_convert_nscap(dentry, &value, size); + if (error < 0) + return error; + size = error; + } + retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(dentry, name, value, size, flags, @@ -289,6 +297,9 @@ retry_deleg: if (!error) goto retry_deleg; } + if (value != orig_value) + kfree(value); + return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); @@ -537,12 +548,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) posix_acl_fix_xattr_from_user(kvalue, size); - else if (strcmp(kname, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(d, &kvalue, size); - if (error < 0) - goto out; - size = error; - } } error = vfs_setxattr(d, kname, kvalue, size, flags); diff --git a/include/linux/capability.h b/include/linux/capability.h index 1e7fe311cabe..b2f698915c0f 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -270,6 +270,6 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); -extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); +extern int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size); #endif /* !_LINUX_CAPABILITY_H */ diff --git a/security/commoncap.c b/security/commoncap.c index 59bf3c1674c8..bacc1111d871 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -473,7 +473,7 @@ static bool validheader(size_t size, const struct vfs_cap_data *cap) * * If all is ok, we return the new size, on error return < 0. */ -int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size) +int cap_convert_nscap(struct dentry *dentry, const void **ivalue, size_t size) { struct vfs_ns_cap_data *nscap; uid_t nsrootid; @@ -516,7 +516,6 @@ int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size) nscap->magic_etc = cpu_to_le32(nsmagic); memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); - kvfree(*ivalue); *ivalue = nscap; return newsize; } -- cgit v1.2.3 From e0a6aa30504cb8179d07609fb6386705e8f00663 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 13 Dec 2020 09:39:40 +0100 Subject: efi: ia64: disable the capsule loader EFI capsule loading is a feature that was introduced into EFI long after its initial introduction on Itanium, and it is highly unlikely that IA64 systems are receiving firmware updates in the first place, let alone using EFI capsules. So let's disable capsule support altogether on IA64. This fixes a build error on IA64 due to a recent change that added an unconditional include of asm/efi.h, which IA64 does not provide. 
While at it, tweak the make rules a bit so that the EFI capsule component that is always builtin (even if the EFI capsule loader itself is built as a module) is omitted for all architectures if the module is not enabled in the build. Cc: Tony Luck Link: https://lore.kernel.org/linux-efi/20201214152200.38353-1-ardb@kernel.org Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/Kconfig | 2 +- drivers/firmware/efi/Makefile | 5 ++++- include/linux/efi.h | 10 ++++------ 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig index b452cfa2100b..5ac2a37ed025 100644 --- a/drivers/firmware/efi/Kconfig +++ b/drivers/firmware/efi/Kconfig @@ -147,7 +147,7 @@ config EFI_BOOTLOADER_CONTROL config EFI_CAPSULE_LOADER tristate "EFI capsule loader" - depends on EFI + depends on EFI && !IA64 help This option exposes a loader interface "/dev/efi_capsule_loader" for users to load EFI capsules. This driver requires working runtime diff --git a/drivers/firmware/efi/Makefile b/drivers/firmware/efi/Makefile index d6ca2da19339..467e94259679 100644 --- a/drivers/firmware/efi/Makefile +++ b/drivers/firmware/efi/Makefile @@ -12,7 +12,10 @@ KASAN_SANITIZE_runtime-wrappers.o := n obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o obj-$(CONFIG_EFI) += efi.o vars.o reboot.o memattr.o tpm.o -obj-$(CONFIG_EFI) += capsule.o memmap.o +obj-$(CONFIG_EFI) += memmap.o +ifneq ($(CONFIG_EFI_CAPSULE_LOADER),) +obj-$(CONFIG_EFI) += capsule.o +endif obj-$(CONFIG_EFI_PARAMS_FROM_FDT) += fdtparams.o obj-$(CONFIG_EFI_VARS) += efivars.o obj-$(CONFIG_EFI_ESRT) += esrt.o diff --git a/include/linux/efi.h b/include/linux/efi.h index 1cd5d91d8ca1..763b816ba19c 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -817,12 +817,6 @@ static inline bool efi_enabled(int feature) static inline void efi_reboot(enum reboot_mode reboot_mode, const char *__unused) {} -static inline bool -efi_capsule_pending(int *reset_type) -{ - return false; -} - static inline bool efi_soft_reserve_enabled(void) { return false; @@ -1038,6 +1032,7 @@ bool efivar_validate(efi_guid_t vendor, efi_char16_t *var_name, u8 *data, bool efivar_variable_is_removable(efi_guid_t vendor, const char *name, size_t len); +#if IS_ENABLED(CONFIG_EFI_CAPSULE_LOADER) extern bool efi_capsule_pending(int *reset_type); extern int efi_capsule_supported(efi_guid_t guid, u32 flags, @@ -1045,6 +1040,9 @@ extern int efi_capsule_supported(efi_guid_t guid, u32 flags, extern int efi_capsule_update(efi_capsule_header_t *capsule, phys_addr_t *pages); +#else +static inline bool efi_capsule_pending(int *reset_type) { return false; } +#endif #ifdef CONFIG_EFI_RUNTIME_MAP int efi_runtime_map_init(struct kobject *); -- cgit v1.2.3 From 50c9132ddfb2024e96900407beeec660cf9848bd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 25 Sep 2020 07:55:39 -0400 Subject: ceph: add new RECOVER mount_state when recovering session When recovering a session (a la recover_session=clean), we want to do all of the operations that we do on a forced umount, but changing the mount state to SHUTDOWN can cause queued MDS requests to fail when the session comes back. Most of those can idle until the session is recovered in this situation. Reserve the SHUTDOWN state for forced umount, and add a new RECOVER state for the forced reconnect situation. Change several tests for equality with SHUTDOWN to test for that state or RECOVER.
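The ordered-comparison idiom used throughout the patch can be sketched as follows (ceph_mount_is_down() is a hypothetical helper name, not part of the patch):

    /* RECOVER sorts after SHUTDOWN in the mount_state enum, so a
     * single ordered test covers both the forced-umount and the
     * forced-reconnect cases. */
    static bool ceph_mount_is_down(struct ceph_fs_client *fsc)
    {
    	return READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN;
    }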
Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 ++-- fs/ceph/caps.c | 2 +- fs/ceph/inode.c | 2 +- fs/ceph/mds_client.c | 4 ++-- fs/ceph/super.c | 14 ++++++++++---- include/linux/ceph/libceph.h | 1 + 6 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 35c83f65475b..e10b07edc95c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -840,7 +840,7 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { if (ci->i_wrbuffer_ref > 0) { pr_warn_ratelimited( "writepage_start %p %lld forced umount\n", @@ -1264,7 +1264,7 @@ ceph_find_incompatible(struct page *page) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { dout(" page %p forced umount\n", page); return ERR_PTR(-EIO); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8552d1082b0e..c74d8182fb48 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2747,7 +2747,7 @@ again: goto out_unlock; } - if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { dout("get_cap_refs %p forced umount\n", inode); ret = -EIO; goto out_unlock; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 526faf4778ce..02b11a4a4d39 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1888,7 +1888,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) mutex_lock(&ci->i_truncate_mutex); - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", inode, ceph_ino(inode)); mapping_set_error(inode->i_mapping, -EIO); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8f1d7500a7ec..a2d6ef808f70 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1595,7 +1595,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = fsc->mdsc; - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { if (inode->i_data.nrpages > 0) invalidate = true; if (ci->i_wrbuffer_ref > 0) @@ -4678,7 +4678,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { u64 want_tid, want_flush; - if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) return; dout("sync\n"); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 33ba6f0aa55c..9b1b7f4cfdd4 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -831,6 +831,13 @@ static void destroy_caches(void) ceph_fscache_unregister(); } +static void __ceph_umount_begin(struct ceph_fs_client *fsc) +{ + ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); + ceph_mdsc_force_umount(fsc->mdsc); + fsc->filp_gen++; // invalidate open files +} + /* * ceph_umount_begin - initiate forced umount. Tear down the * mount, skipping steps that may hang while waiting for server(s). 
@@ -843,9 +850,7 @@ static void ceph_umount_begin(struct super_block *sb) if (!fsc) return; fsc->mount_state = CEPH_MOUNT_SHUTDOWN; - ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); - ceph_mdsc_force_umount(fsc->mdsc); - fsc->filp_gen++; // invalidate open files + __ceph_umount_begin(fsc); } static const struct super_operations ceph_super_ops = { @@ -1234,7 +1239,8 @@ int ceph_force_reconnect(struct super_block *sb) struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; - ceph_umount_begin(sb); + fsc->mount_state = CEPH_MOUNT_RECOVER; + __ceph_umount_begin(fsc); /* Make sure all page caches get invalidated. * see remove_session_caps_cb() */ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index c8645f0b797d..eb5a7ca13f9c 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -104,6 +104,7 @@ enum { CEPH_MOUNT_UNMOUNTING, CEPH_MOUNT_UNMOUNTED, CEPH_MOUNT_SHUTDOWN, + CEPH_MOUNT_RECOVER, }; static inline unsigned long ceph_timeout_jiffies(unsigned long timeout) -- cgit v1.2.3 From 36c9478d6069994848c8897755b4380aa0a29dd3 Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Tue, 10 Nov 2020 21:20:08 +0800 Subject: libceph: remove unused port macros 1. The monitor's default port is defined by CEPH_MON_PORT. 2. CEPH_PORT_FIRST, CEPH_PORT_START and CEPH_PORT_LAST are not needed. Signed-off-by: Changcheng Liu Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/msgr.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 9e50aede46c8..46939485f2c3 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -8,15 +8,6 @@ #define CEPH_MON_PORT 6789 /* default monitor port */ -/* - * client-side processes will try to bind to ports in this - * range, simply for the benefit of tools like nmap or wireshark - * that would like to identify the protocol. - */ -#define CEPH_PORT_FIRST 6789 -#define CEPH_PORT_START 6800 /* non-monitors start here */ -#define CEPH_PORT_LAST 6900 - /* * tcp connection banner. include a protocol version. and adjust * whenever the wire protocol changes. try to keep this string length -- cgit v1.2.3 From 968cd14edc3acff251f98bdc1eb15f13f05dd5fb Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 9 Dec 2020 10:52:20 +0800 Subject: ceph: set osdmap epoch for setxattr When setting the file/dir layout, the MDS may need data pool info, so it has to check the osdmap. At present, if the MDS doesn't find the specified data pool, it will try to get the latest osdmap. Now, if we pass the osd epoch along with setxattr, the MDS only needs to check that epoch of the osdmap.
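The client-side change amounts to one extra field in the request arguments, as the diff below shows:

    /* Send the epoch the client already holds, so the MDS can validate
     * the named data pool against that specific osdmap. */
    req->r_args.setxattr.osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);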
URL: https://tracker.ceph.com/issues/48504 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 2 +- fs/ceph/xattr.c | 3 +++ include/linux/ceph/ceph_fs.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 70d347989603..75034f7d8f46 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2533,7 +2533,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free2; } - msg->hdr.version = cpu_to_le16(2); + msg->hdr.version = cpu_to_le16(3); msg->hdr.tid = cpu_to_le64(req->r_tid); head = msg->front.iov_base; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index cd8c7aaa23a0..24997982de01 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1022,6 +1022,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_pagelist *pagelist = NULL; int op = CEPH_MDS_OP_SETXATTR; int err; @@ -1060,6 +1061,8 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, if (op == CEPH_MDS_OP_SETXATTR) { req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_args.setxattr.osdmap_epoch = + cpu_to_le32(osdc->osdmap->epoch); req->r_pagelist = pagelist; pagelist = NULL; } diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 455e9b9e2adf..c0f1b921ec69 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -424,6 +424,7 @@ union ceph_mds_request_args { } __attribute__ ((packed)) open; struct { __le32 flags; + __le32 osdmap_epoch; /* used for setting file/dir layouts */ } __attribute__ ((packed)) setxattr; struct { struct ceph_file_layout_legacy layout; -- cgit v1.2.3 From 4f1ddb1ea874c7703528a8c21b77b7f2462ee247 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 9 Dec 2020 10:12:59 -0500 Subject: ceph: implement updated ceph_mds_request_head structure When we added the btime feature in mainline ceph, we had to extend struct ceph_mds_request_args so that it could be set. Implement the same in the kernel client. Rename ceph_mds_request_head to ceph_mds_request_head_old, and add a union ceph_mds_request_args_ext to allow for the extended size of the new header format. Add the appropriate code to handle both formats in create_request_message() and key the behavior on whether the peer supports CEPH_FEATURE_FS_BTIME. The gid_list field in the payload is now populated from the saved credential. For now, we don't add any support for setting the btime via setattr, but this does enable us to add that in the future.
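A minimal sketch of the resulting compatibility rule (request_head_len() is a hypothetical helper, not part of the patch):

    /* Peers without CEPH_FEATURE_FS_BTIME keep getting the old,
     * unversioned header; newer peers get the versioned header whose
     * legacy fields start at oldest_client_tid. */
    static size_t request_head_len(u64 peer_features)
    {
    	if (!(peer_features & CEPH_FEATURE_FS_BTIME))
    		return sizeof(struct ceph_mds_request_head_old);
    	return sizeof(struct ceph_mds_request_head);
    }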
[ idryomov: break unnecessarily long lines ] Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 75 ++++++++++++++++++++++++++++++++++++-------- include/linux/ceph/ceph_fs.h | 32 ++++++++++++++++++- 2 files changed, 93 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 14d0a11b7d88..a256d95ec99a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2478,21 +2478,24 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, /* * called under mdsc->mutex */ -static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, +static struct ceph_msg *create_request_message(struct ceph_mds_session *session, struct ceph_mds_request *req, - int mds, bool drop_cap_releases) + bool drop_cap_releases) { + int mds = session->s_mds; + struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_msg *msg; - struct ceph_mds_request_head *head; + struct ceph_mds_request_head_old *head; const char *path1 = NULL; const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; int pathlen1 = 0, pathlen2 = 0; bool freepath1 = false, freepath2 = false; - int len; + int len, i; u16 releases; void *p, *end; int ret; + bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, @@ -2514,14 +2517,23 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free1; } - len = sizeof(*head) + - pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + + if (legacy) { + /* Old style */ + len = sizeof(*head); + } else { + /* New style: add gid_list and any later fields */ + len = sizeof(struct ceph_mds_request_head) + sizeof(u32) + + (sizeof(u64) * req->r_cred->group_info->ngroups); + } + + len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + sizeof(struct ceph_timespec); /* calculate (max) length for cap releases */ len += sizeof(struct ceph_mds_request_release) * (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); + if (req->r_dentry_drop) len += pathlen1; if (req->r_old_dentry_drop) @@ -2533,11 +2545,25 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free2; } - msg->hdr.version = cpu_to_le16(3); msg->hdr.tid = cpu_to_le64(req->r_tid); - head = msg->front.iov_base; - p = msg->front.iov_base + sizeof(*head); + /* + * The old ceph_mds_request_header didn't contain a version field, and + * one was added when we moved the message version from 3->4. 
+ */ + if (legacy) { + msg->hdr.version = cpu_to_le16(3); + head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*head); + } else { + struct ceph_mds_request_head *new_head = msg->front.iov_base; + + msg->hdr.version = cpu_to_le16(4); + new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); + head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; + p = msg->front.iov_base + sizeof(*new_head); + } + end = msg->front.iov_base + msg->front.iov_len; head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); @@ -2590,6 +2616,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_copy(&p, &ts, sizeof(ts)); } + /* gid list */ + if (!legacy) { + ceph_encode_32(&p, req->r_cred->group_info->ngroups); + for (i = 0; i < req->r_cred->group_info->ngroups; i++) + ceph_encode_64(&p, from_kgid(&init_user_ns, + req->r_cred->group_info->gid[i])); + } + if (WARN_ON_ONCE(p > end)) { ceph_msg_put(msg); msg = ERR_PTR(-ERANGE); @@ -2633,6 +2667,18 @@ static void complete_request(struct ceph_mds_client *mdsc, complete_all(&req->r_completion); } +static struct ceph_mds_request_head_old * +find_old_request_head(void *p, u64 features) +{ + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); + struct ceph_mds_request_head *new_head; + + if (legacy) + return (struct ceph_mds_request_head_old *)p; + new_head = (struct ceph_mds_request_head *)p; + return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; +} + /* * called under mdsc->mutex */ @@ -2642,7 +2688,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, { int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; - struct ceph_mds_request_head *rhead; + struct ceph_mds_request_head_old *rhead; struct ceph_msg *msg; int flags = 0; @@ -2661,6 +2707,7 @@ static int __prepare_send_request(struct ceph_mds_session *session, if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { void *p; + /* * Replay. Do not regenerate message (and rebuild * paths, etc.); just use the original message. @@ -2668,7 +2715,8 @@ static int __prepare_send_request(struct ceph_mds_session *session, * d_move mangles the src name. 
*/ msg = req->r_request; - rhead = msg->front.iov_base; + rhead = find_old_request_head(msg->front.iov_base, + session->s_con.peer_features); flags = le32_to_cpu(rhead->flags); flags |= CEPH_MDS_FLAG_REPLAY; @@ -2699,14 +2747,15 @@ static int __prepare_send_request(struct ceph_mds_session *session, ceph_msg_put(req->r_request); req->r_request = NULL; } - msg = create_request_message(mdsc, req, mds, drop_cap_releases); + msg = create_request_message(session, req, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); return PTR_ERR(msg); } req->r_request = msg; - rhead = msg->front.iov_base; + rhead = find_old_request_head(msg->front.iov_base, + session->s_con.peer_features); rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index c0f1b921ec69..d44d98033d58 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -446,11 +446,25 @@ union ceph_mds_request_args { } __attribute__ ((packed)) lookupino; } __attribute__ ((packed)); +union ceph_mds_request_args_ext { + union ceph_mds_request_args old; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + struct ceph_timespec btime; + } __attribute__ ((packed)) setattr_ext; +}; + #define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ #define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ #define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */ -struct ceph_mds_request_head { +struct ceph_mds_request_head_old { __le64 oldest_client_tid; __le32 mdsmap_epoch; /* on client */ __le32 flags; /* CEPH_MDS_FLAG_* */ @@ -463,6 +477,22 @@ struct ceph_mds_request_head { union ceph_mds_request_args args; } __attribute__ ((packed)); +#define CEPH_MDS_REQUEST_HEAD_VERSION 1 + +struct ceph_mds_request_head { + __le16 version; /* struct version */ + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* count retry, fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. (if replaying) */ + union ceph_mds_request_args_ext args; +} __attribute__ ((packed)); + /* cap/lease release record */ struct ceph_mds_request_release { __le64 ino, cap_id; /* ino and unique cap id */ -- cgit v1.2.3 From 418af5b3bfc4f1ef4854e83c5be8a0bdce51e95c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 29 Oct 2020 14:49:10 +0100 Subject: libceph: lower exponential backoff delay The current setting allows the backoff to climb up to 5 minutes. This is too high -- it becomes hard to tell whether the client is stuck on something or just in backoff. In userspace, ms_max_backoff is defaulted to 15 seconds. Let's do the same. 
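With BASE_DELAY_INTERVAL at HZ/4 and MAX_DELAY_INTERVAL at 15*HZ, the backoff now doubles from 0.25s and clamps at 15s; restating the new con_fault() logic for clarity:

    /* Resulting delay sequence in seconds: 0.25, 0.5, 1, 2, 4, 8,
     * 15, 15, ... (clamped rather than allowed to reach 16). */
    if (!con->delay) {
    	con->delay = BASE_DELAY_INTERVAL;        /* HZ / 4 */
    } else if (con->delay < MAX_DELAY_INTERVAL) {
    	con->delay *= 2;
    	if (con->delay > MAX_DELAY_INTERVAL)
    		con->delay = MAX_DELAY_INTERVAL; /* 15 * HZ */
    }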
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 4 ++-- net/ceph/messenger.c | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 60b324efd1c4..b47c7cc4c90a 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -241,8 +241,8 @@ struct ceph_msg { }; /* ceph connection fault delay defaults, for exponential backoff */ -#define BASE_DELAY_INTERVAL (HZ/2) -#define MAX_DELAY_INTERVAL (5 * 60 * HZ) +#define BASE_DELAY_INTERVAL (HZ / 4) +#define MAX_DELAY_INTERVAL (15 * HZ) /* * A single connection with another host. diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 214ae2d17a90..f3eb66bab988 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2812,6 +2812,9 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) return -ENOENT; } + if (delay >= HZ) + delay = round_jiffies_relative(delay); + dout("%s %p %lu\n", __func__, con, delay); if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); @@ -2871,7 +2874,7 @@ static bool con_backoff(struct ceph_connection *con) if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) return false; - ret = queue_con_delay(con, round_jiffies_relative(con->delay)); + ret = queue_con_delay(con, con->delay); if (ret) { dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); @@ -3018,10 +3021,13 @@ static void con_fault(struct ceph_connection *con) } else { /* retry after a delay. */ con->state = CON_STATE_PREOPEN; - if (con->delay == 0) + if (!con->delay) { con->delay = BASE_DELAY_INTERVAL; - else if (con->delay < MAX_DELAY_INTERVAL) + } else if (con->delay < MAX_DELAY_INTERVAL) { con->delay *= 2; + if (con->delay > MAX_DELAY_INTERVAL) + con->delay = MAX_DELAY_INTERVAL; + } con_flag_set(con, CON_FLAG_BACKOFF); queue_con(con); } -- cgit v1.2.3 From 5cd8da3a1ca2160b8f9c2ff6a96762e66410ea38 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 13 Oct 2020 17:23:22 +0200 Subject: libceph: drop msg->ack_stamp field It is set in process_ack() but never used. Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 1 - net/ceph/messenger.c | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index b47c7cc4c90a..6f77e70db855 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -235,7 +235,6 @@ struct ceph_msg { bool more_to_follow; bool needs_out_seq; int front_alloc_len; - unsigned long ack_stamp; /* tx: when we were acked */ struct ceph_msgpool *pool; }; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index a6d93280d3e9..29b00b2cecac 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2279,7 +2279,6 @@ static void process_ack(struct ceph_connection *con) break; dout("got ack for seq %llu type %d at %p\n", seq, le16_to_cpu(m->hdr.type), m); - m->ack_stamp = jiffies; ceph_msg_remove(m); } -- cgit v1.2.3 From 30be780a87211de75b93935c20a0913e46744a3f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 9 Nov 2020 14:11:26 +0100 Subject: libceph: make con->state an int unsigned long is a leftover from when con->state used to be a set of bits managed with set_bit(), clear_bit(), etc. Save a bit of memory. 
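For illustration only, a sketch contrasting the historical bitmap usage that this type was a leftover from with today's plain assignment (the two lines would not coexist in real code):

    /* Historical (sketch): state kept as bits in an unsigned long */
    set_bit(CON_STATE_OPEN, &con->state);
    /* Today: states are mutually exclusive, so a plain int suffices */
    con->state = CON_STATE_OPEN;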
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 6f77e70db855..f053de4f46dd 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -257,13 +257,13 @@ struct ceph_connection { struct ceph_messenger *msgr; + int state; atomic_t sock_state; struct socket *sock; struct ceph_entity_addr peer_addr; /* peer address */ struct ceph_entity_addr peer_addr_for_me; unsigned long flags; - unsigned long state; const char *error_msg; /* error message, if any */ struct ceph_entity_name peer_name; /* peer name */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index bb79a59daf42..9c92f101aa88 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -372,7 +372,7 @@ static void ceph_sock_data_ready(struct sock *sk) } if (sk->sk_state != TCP_CLOSE_WAIT) { - dout("%s on %p state = %lu, queueing work\n", __func__, + dout("%s %p state = %d, queueing work\n", __func__, con, con->state); queue_con(con); } @@ -406,7 +406,7 @@ static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; - dout("%s %p state = %lu sk_state = %u\n", __func__, + dout("%s %p state = %d sk_state = %u\n", __func__, con, con->state, sk->sk_state); switch (sk->sk_state) { @@ -2582,7 +2582,7 @@ static int try_write(struct ceph_connection *con) { int ret = 1; - dout("try_write start %p state %lu\n", con, con->state); + dout("try_write start %p state %d\n", con, con->state); if (con->state != CON_STATE_PREOPEN && con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && @@ -2600,7 +2600,7 @@ static int try_write(struct ceph_connection *con) BUG_ON(con->in_msg); con->in_tag = CEPH_MSGR_TAG_READY; - dout("try_write initiating connect on %p new state %lu\n", + dout("try_write initiating connect on %p new state %d\n", con, con->state); ret = ceph_tcp_connect(con); if (ret < 0) { @@ -2679,7 +2679,7 @@ static int try_read(struct ceph_connection *con) int ret = -1; more: - dout("try_read start on %p state %lu\n", con, con->state); + dout("try_read start %p state %d\n", con, con->state); if (con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && con->state != CON_STATE_OPEN) @@ -2876,11 +2876,7 @@ static bool con_sock_closed(struct ceph_connection *con) CASE(OPEN); CASE(STANDBY); default: - pr_warn("%s con %p unrecognized state %lu\n", - __func__, con, con->state); - con->error_msg = "unrecognized con state"; BUG(); - break; } #undef CASE @@ -2998,7 +2994,7 @@ static void ceph_con_workfn(struct work_struct *work) */ static void con_fault(struct ceph_connection *con) { - dout("fault %p state %lu to peer %s\n", + dout("fault %p state %d to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), -- cgit v1.2.3 From 6d7f62bfb5b5da6b0b37174c1fd32545f3b5b90d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 9 Nov 2020 14:59:02 +0100 Subject: libceph: rename and export con->state states In preparation for msgr2, rename msgr1 specific states and move the defines to the header file. Also drop state transition comments. They don't cover all possible transitions (e.g. NEGOTIATING -> STANDBY, etc) and currently do more harm than good. 
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 12 +++++- net/ceph/messenger.c | 90 ++++++++++++++++++------------------------ 2 files changed, 50 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index f053de4f46dd..e6be85b67c6c 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -239,6 +239,16 @@ struct ceph_msg { struct ceph_msgpool *pool; }; +/* + * connection states + */ +#define CEPH_CON_S_CLOSED 1 +#define CEPH_CON_S_PREOPEN 2 +#define CEPH_CON_S_V1_BANNER 3 +#define CEPH_CON_S_V1_CONNECT_MSG 4 +#define CEPH_CON_S_OPEN 5 +#define CEPH_CON_S_STANDBY 6 + /* ceph connection fault delay defaults, for exponential backoff */ #define BASE_DELAY_INTERVAL (HZ / 4) #define MAX_DELAY_INTERVAL (15 * HZ) @@ -257,7 +267,7 @@ struct ceph_connection { struct ceph_messenger *msgr; - int state; + int state; /* CEPH_CON_S_* */ atomic_t sock_state; struct socket *sock; struct ceph_entity_addr peer_addr; /* peer address */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9c92f101aa88..adeb69ba6747 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -82,16 +82,6 @@ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ -/* - * connection states - */ -#define CON_STATE_CLOSED 1 /* -> PREOPEN */ -#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ -#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ -#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ -#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ -#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ - /* * ceph_connection flag bits */ @@ -674,7 +664,7 @@ void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); @@ -698,8 +688,8 @@ void ceph_con_open(struct ceph_connection *con, mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(addr)); - WARN_ON(con->state != CON_STATE_CLOSED); - con->state = CON_STATE_PREOPEN; + WARN_ON(con->state != CEPH_CON_S_CLOSED); + con->state = CEPH_CON_S_PREOPEN; con->peer_name.type = (__u8) entity_type; con->peer_name.num = cpu_to_le64(entity_num); @@ -739,7 +729,7 @@ void ceph_con_init(struct ceph_connection *con, void *private, INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, ceph_con_workfn); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; } EXPORT_SYMBOL(ceph_con_init); @@ -2183,7 +2173,7 @@ static int process_connect(struct ceph_connection *con) if (con->ops->peer_reset) con->ops->peer_reset(con); mutex_lock(&con->mutex); - if (con->state != CON_STATE_NEGOTIATING) + if (con->state != CEPH_CON_S_V1_CONNECT_MSG) return -EAGAIN; break; @@ -2232,8 +2222,8 @@ static int process_connect(struct ceph_connection *con) return -1; } - WARN_ON(con->state != CON_STATE_NEGOTIATING); - con->state = CON_STATE_OPEN; + WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); + con->state = CEPH_CON_S_OPEN; con->auth_retry = 0; /* we authenticated; clear flag */ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; @@ -2583,16 +2573,16 @@ static int try_write(struct ceph_connection *con) int ret = 1; dout("try_write start %p state %d\n", con, con->state); - if (con->state 
!= CON_STATE_PREOPEN && - con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) + if (con->state != CEPH_CON_S_PREOPEN && + con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) return 0; /* open the socket first? */ - if (con->state == CON_STATE_PREOPEN) { + if (con->state == CEPH_CON_S_PREOPEN) { BUG_ON(con->sock); - con->state = CON_STATE_CONNECTING; + con->state = CEPH_CON_S_V1_BANNER; con_out_kvec_reset(con); prepare_write_banner(con); @@ -2646,7 +2636,7 @@ more: } do_next: - if (con->state == CON_STATE_OPEN) { + if (con->state == CEPH_CON_S_OPEN) { if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { prepare_write_keepalive(con); goto more; @@ -2680,9 +2670,9 @@ static int try_read(struct ceph_connection *con) more: dout("try_read start %p state %d\n", con, con->state); - if (con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN) + if (con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) return 0; BUG_ON(!con->sock); @@ -2690,8 +2680,7 @@ more: dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); - if (con->state == CON_STATE_CONNECTING) { - dout("try_read connecting\n"); + if (con->state == CEPH_CON_S_V1_BANNER) { ret = read_partial_banner(con); if (ret <= 0) goto out; @@ -2699,7 +2688,7 @@ more: if (ret < 0) goto out; - con->state = CON_STATE_NEGOTIATING; + con->state = CEPH_CON_S_V1_CONNECT_MSG; /* * Received banner is good, exchange connection info. @@ -2715,8 +2704,7 @@ more: goto out; } - if (con->state == CON_STATE_NEGOTIATING) { - dout("try_read negotiating\n"); + if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { ret = read_partial_connect(con); if (ret <= 0) goto out; @@ -2726,7 +2714,7 @@ more: goto more; } - WARN_ON(con->state != CON_STATE_OPEN); + WARN_ON(con->state != CEPH_CON_S_OPEN); if (con->in_base_pos < 0) { /* @@ -2760,7 +2748,7 @@ more: break; case CEPH_MSGR_TAG_CLOSE: con_close_socket(con); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; goto out; default: goto bad_tag; @@ -2785,7 +2773,7 @@ more: if (con->in_tag == CEPH_MSGR_TAG_READY) goto more; process_message(con); - if (con->state == CON_STATE_OPEN) + if (con->state == CEPH_CON_S_OPEN) prepare_read_tag(con); goto more; } @@ -2864,15 +2852,15 @@ static bool con_sock_closed(struct ceph_connection *con) return false; #define CASE(x) \ - case CON_STATE_ ## x: \ + case CEPH_CON_S_ ## x: \ con->error_msg = "socket closed (con state " #x ")"; \ break; switch (con->state) { CASE(CLOSED); CASE(PREOPEN); - CASE(CONNECTING); - CASE(NEGOTIATING); + CASE(V1_BANNER); + CASE(V1_CONNECT_MSG); CASE(OPEN); CASE(STANDBY); default: @@ -2943,16 +2931,16 @@ static void ceph_con_workfn(struct work_struct *work) dout("%s: con %p BACKOFF\n", __func__, con); break; } - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("%s: con %p STANDBY\n", __func__, con); break; } - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("%s: con %p CLOSED\n", __func__, con); BUG_ON(con->sock); break; } - if (con->state == CON_STATE_PREOPEN) { + if (con->state == CEPH_CON_S_PREOPEN) { dout("%s: con %p PREOPEN\n", __func__, con); BUG_ON(con->sock); } @@ -3001,15 +2989,15 @@ static void con_fault(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; - 
WARN_ON(con->state != CON_STATE_CONNECTING && - con->state != CON_STATE_NEGOTIATING && - con->state != CON_STATE_OPEN); + WARN_ON(con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN); ceph_con_reset_protocol(con); if (con_flag_test(con, CON_FLAG_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); - con->state = CON_STATE_CLOSED; + con->state = CEPH_CON_S_CLOSED; return; } @@ -3022,10 +3010,10 @@ static void con_fault(struct ceph_connection *con) !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con->state = CON_STATE_STANDBY; + con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. */ - con->state = CON_STATE_PREOPEN; + con->state = CEPH_CON_S_PREOPEN; if (!con->delay) { con->delay = BASE_DELAY_INTERVAL; } else if (con->delay < MAX_DELAY_INTERVAL) { @@ -3092,9 +3080,9 @@ static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ - if (con->state == CON_STATE_STANDBY) { + if (con->state == CEPH_CON_S_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); - con->state = CON_STATE_PREOPEN; + con->state = CEPH_CON_S_PREOPEN; con->connect_seq++; WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); @@ -3115,7 +3103,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) mutex_lock(&con->mutex); - if (con->state == CON_STATE_CLOSED) { + if (con->state == CEPH_CON_S_CLOSED) { dout("con_send %p closed, dropping %p\n", con, msg); ceph_msg_put(msg); mutex_unlock(&con->mutex); @@ -3456,7 +3444,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); - if (con->state != CON_STATE_OPEN) { + if (con->state != CEPH_CON_S_OPEN) { if (msg) ceph_msg_put(msg); return -EAGAIN; -- cgit v1.2.3 From 3fefd43e741a5b8d55aeb9115ff488ad2cad439b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 9 Nov 2020 14:56:36 +0100 Subject: libceph: rename and export con->flags bits In preparation for msgr2, move the defines to the header file. 
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 13 ++++++- net/ceph/messenger.c | 77 +++++++++++++++++++----------------------- 2 files changed, 46 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index e6be85b67c6c..b6962a0fd76f 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -249,6 +249,17 @@ struct ceph_msg { #define CEPH_CON_S_OPEN 5 #define CEPH_CON_S_STANDBY 6 +/* + * ceph_connection flag bits + */ +#define CEPH_CON_F_LOSSYTX 0 /* we can close channel or drop + messages on errors */ +#define CEPH_CON_F_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ +#define CEPH_CON_F_WRITE_PENDING 2 /* we have data ready to send */ +#define CEPH_CON_F_SOCK_CLOSED 3 /* socket state changed to closed */ +#define CEPH_CON_F_BACKOFF 4 /* need to retry queuing delayed + work */ + /* ceph connection fault delay defaults, for exponential backoff */ #define BASE_DELAY_INTERVAL (HZ / 4) #define MAX_DELAY_INTERVAL (15 * HZ) @@ -273,7 +284,7 @@ struct ceph_connection { struct ceph_entity_addr peer_addr; /* peer address */ struct ceph_entity_addr peer_addr_for_me; - unsigned long flags; + unsigned long flags; /* CEPH_CON_F_* */ const char *error_msg; /* error message, if any */ struct ceph_entity_name peer_name; /* peer name */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index adeb69ba6747..ee87dc3af959 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -82,24 +82,14 @@ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ -/* - * ceph_connection flag bits - */ -#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop - * messages on errors */ -#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ -#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ -#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ -#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ - static bool con_flag_valid(unsigned long con_flag) { switch (con_flag) { - case CON_FLAG_LOSSYTX: - case CON_FLAG_KEEPALIVE_PENDING: - case CON_FLAG_WRITE_PENDING: - case CON_FLAG_SOCK_CLOSED: - case CON_FLAG_BACKOFF: + case CEPH_CON_F_LOSSYTX: + case CEPH_CON_F_KEEPALIVE_PENDING: + case CEPH_CON_F_WRITE_PENDING: + case CEPH_CON_F_SOCK_CLOSED: + case CEPH_CON_F_BACKOFF: return true; default: return false; @@ -380,7 +370,7 @@ static void ceph_sock_write_space(struct sock *sk) * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). */ - if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { + if (con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); @@ -406,7 +396,7 @@ static void ceph_sock_state_change(struct sock *sk) case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); - con_flag_set(con, CON_FLAG_SOCK_CLOSED); + con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: @@ -597,7 +587,7 @@ static int con_close_socket(struct ceph_connection *con) * received a socket close event before we had the chance to * shut the socket down. 
*/ - con_flag_clear(con, CON_FLAG_SOCK_CLOSED); + con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); con_sock_state_closed(con); return rc; @@ -666,10 +656,10 @@ void ceph_con_close(struct ceph_connection *con) dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); con->state = CEPH_CON_S_CLOSED; - con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ - con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); - con_flag_clear(con, CON_FLAG_BACKOFF); + con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next connect */ + con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); + con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + con_flag_clear(con, CEPH_CON_F_BACKOFF); ceph_con_reset_protocol(con); ceph_con_reset_session(con); @@ -1365,7 +1355,7 @@ static void prepare_write_message(struct ceph_connection *con) prepare_write_message_footer(con); } - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1386,7 +1376,7 @@ static void prepare_write_ack(struct ceph_connection *con) &con->out_temp_ack); con->out_more = 1; /* more will follow.. eventually.. */ - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1404,7 +1394,7 @@ static void prepare_write_seq(struct ceph_connection *con) con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1425,7 +1415,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) } else { con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); } - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1464,7 +1454,7 @@ static void prepare_write_banner(struct ceph_connection *con) &con->msgr->my_enc_addr); con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } static void __prepare_write_connect(struct ceph_connection *con) @@ -1475,7 +1465,7 @@ static void __prepare_write_connect(struct ceph_connection *con) con->auth->authorizer_buf); con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); + con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } static int prepare_write_connect(struct ceph_connection *con) @@ -2236,7 +2226,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.connect_seq)); if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - con_flag_set(con, CON_FLAG_LOSSYTX); + con_flag_set(con, CEPH_CON_F_LOSSYTX); con->delay = 0; /* reset backoff memory */ @@ -2637,7 +2627,8 @@ more: do_next: if (con->state == CEPH_CON_S_OPEN) { - if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { + if (con_flag_test_and_clear(con, + CEPH_CON_F_KEEPALIVE_PENDING)) { prepare_write_keepalive(con); goto more; } @@ -2653,7 +2644,7 @@ do_next: } /* Nothing to do! 
*/ - con_flag_clear(con, CON_FLAG_WRITE_PENDING); + con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); dout("try_write nothing else to write.\n"); ret = 0; out: @@ -2848,7 +2839,7 @@ static void cancel_con(struct ceph_connection *con) static bool con_sock_closed(struct ceph_connection *con) { - if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) + if (!con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) return false; #define CASE(x) \ @@ -2875,7 +2866,7 @@ static bool con_backoff(struct ceph_connection *con) { int ret; - if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) + if (!con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) return false; ret = queue_con_delay(con, con->delay); @@ -2883,7 +2874,7 @@ static bool con_backoff(struct ceph_connection *con) dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); - con_flag_set(con, CON_FLAG_BACKOFF); + con_flag_set(con, CEPH_CON_F_BACKOFF); } return true; @@ -2995,7 +2986,7 @@ static void con_fault(struct ceph_connection *con) ceph_con_reset_protocol(con); - if (con_flag_test(con, CON_FLAG_LOSSYTX)) { + if (con_flag_test(con, CEPH_CON_F_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); con->state = CEPH_CON_S_CLOSED; return; @@ -3007,9 +2998,9 @@ static void con_fault(struct ceph_connection *con) /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { + !con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - con_flag_clear(con, CON_FLAG_WRITE_PENDING); + con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. */ @@ -3021,7 +3012,7 @@ static void con_fault(struct ceph_connection *con) if (con->delay > MAX_DELAY_INTERVAL) con->delay = MAX_DELAY_INTERVAL; } - con_flag_set(con, CON_FLAG_BACKOFF); + con_flag_set(con, CEPH_CON_F_BACKOFF); queue_con(con); } } @@ -3084,8 +3075,8 @@ static void clear_standby(struct ceph_connection *con) dout("clear_standby %p and ++connect_seq\n", con); con->state = CEPH_CON_S_PREOPEN; con->connect_seq++; - WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); - WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); + WARN_ON(con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); + WARN_ON(con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } } @@ -3126,7 +3117,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -3222,10 +3213,10 @@ void ceph_con_keepalive(struct ceph_connection *con) dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); - con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING); + con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); - if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); -- cgit v1.2.3 From 699921d9e68ff3d9f8645488c12f4689c6533d70 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 9 Nov 2020 14:37:06 +0100 Subject: libceph: export zero_page In preparation for msgr2, make zero_page global. 
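One detail worth calling out before the diff: ZERO_PAGE(0) returns the kernel's shared, always-zero page, so exporting it costs no memory; the messenger merely pins a reference for its lifetime. A hedged sketch of the lifecycle, mirroring what ceph_msgr_init() and _ceph_msgr_exit() do in the hunks below (the example_* names are illustrative):

/* Sketch: pin the shared zero page at init, release it at exit. */
struct page *ceph_zero_page;

static int example_init(void)
{
	ceph_zero_page = ZERO_PAGE(0);	/* kernel-wide zeroed page */
	get_page(ceph_zero_page);	/* take our own reference */
	return 0;
}

static void example_exit(void)
{
	put_page(ceph_zero_page);	/* drop the reference */
	ceph_zero_page = NULL;
}

Consumers such as write_partial_skip() can then transmit runs of zeros (e.g. the remainder of a revoked message) straight from this page instead of allocating a scratch buffer.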
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 1 + net/ceph/messenger.c | 17 +++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index b6962a0fd76f..513ed5f90bff 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -343,6 +343,7 @@ struct ceph_connection { unsigned long delay; /* current delay interval */ }; +extern struct page *ceph_zero_page; extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ee87dc3af959..d3880fbe8424 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -164,7 +164,7 @@ static void con_fault(struct ceph_connection *con); static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); -static struct page *zero_page; /* used in certain error cases */ +struct page *ceph_zero_page; /* used in certain error cases */ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) { @@ -234,9 +234,9 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } - BUG_ON(zero_page == NULL); - put_page(zero_page); - zero_page = NULL; + BUG_ON(!ceph_zero_page); + put_page(ceph_zero_page); + ceph_zero_page = NULL; ceph_msgr_slab_exit(); } @@ -246,9 +246,9 @@ int __init ceph_msgr_init(void) if (ceph_msgr_slab_init()) return -ENOMEM; - BUG_ON(zero_page != NULL); - zero_page = ZERO_PAGE(0); - get_page(zero_page); + BUG_ON(ceph_zero_page); + ceph_zero_page = ZERO_PAGE(0); + get_page(ceph_zero_page); /* * The number of active work items is limited by the number of @@ -1645,7 +1645,8 @@ static int write_partial_skip(struct ceph_connection *con) if (size == con->out_skip) more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, more); + ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, + more); if (ret <= 0) goto out; con->out_skip -= ret; -- cgit v1.2.3 From 6503e0b69c9d4d78b5450db01e79328f8ed4ef21 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 9 Nov 2020 16:29:47 +0100 Subject: libceph: export remaining protocol independent infrastructure In preparation for msgr2, make all protocol independent functions in messenger.c global. 
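Of the helpers exported here, the data cursor trio is the one a new protocol implementation leans on hardest: ceph_msg_data_cursor_init() positions a cursor over a message's data items, ceph_msg_data_next() yields one contiguous page fragment at a time, and ceph_msg_data_advance() consumes bytes. A hedged sketch of the canonical walk, whose loop shape follows the existing write_partial_message_data() (process_fragment() is a hypothetical consumer, not a real function):

/* Sketch: iterate over a message's data payload page by page. */
static void walk_msg_data(struct ceph_msg *msg)
{
	struct ceph_msg_data_cursor *cursor = &msg->cursor;
	struct page *page;
	size_t off, len;

	ceph_msg_data_cursor_init(cursor, msg, msg->data_length);
	while (cursor->total_resid) {
		if (!cursor->resid) {
			/* current data item exhausted; step to the next */
			ceph_msg_data_advance(cursor, 0);
			continue;
		}
		page = ceph_msg_data_next(cursor, &off, &len, NULL);
		process_fragment(page, off, len);  /* hypothetical */
		ceph_msg_data_advance(cursor, len);
	}
}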
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 39 +++++++++- net/ceph/messenger.c | 157 ++++++++++++++++++++--------------------- 2 files changed, 113 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 513ed5f90bff..93815f1a42b5 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -345,13 +345,50 @@ struct ceph_connection { extern struct page *ceph_zero_page; +void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag); +void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag); +bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag); +bool ceph_con_flag_test_and_clear(struct ceph_connection *con, + unsigned long con_flag); +bool ceph_con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag); + +void ceph_encode_my_addr(struct ceph_messenger *msgr); + +int ceph_tcp_connect(struct ceph_connection *con); +int ceph_con_close_socket(struct ceph_connection *con); +void ceph_con_reset_session(struct ceph_connection *con); + +u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt); +void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq); +void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq); + +void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, + struct ceph_msg *msg, size_t length); +struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece); +void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes); + +u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, + unsigned int length); + +bool ceph_addr_is_blank(const struct ceph_entity_addr *addr); +int ceph_addr_port(const struct ceph_entity_addr *addr); +void ceph_addr_set_port(struct ceph_entity_addr *addr, int p); + +void ceph_con_process_message(struct ceph_connection *con); +int ceph_con_in_msg_alloc(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip); +void ceph_con_get_out_msg(struct ceph_connection *con); + + extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count); - extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); extern void ceph_msgr_flush(void); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d3880fbe8424..85d20372f923 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -96,37 +96,37 @@ static bool con_flag_valid(unsigned long con_flag) } } -static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); clear_bit(con_flag, &con->flags); } -static void con_flag_set(struct ceph_connection *con, unsigned long con_flag) +void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); set_bit(con_flag, &con->flags); } -static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag) +bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_bit(con_flag, &con->flags); } -static bool con_flag_test_and_clear(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_clear(struct 
ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_clear_bit(con_flag, &con->flags); } -static bool con_flag_test_and_set(struct ceph_connection *con, - unsigned long con_flag) +bool ceph_con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); @@ -199,7 +199,7 @@ const char *ceph_pr_addr(const struct ceph_entity_addr *addr) } EXPORT_SYMBOL(ceph_pr_addr); -static void encode_my_addr(struct ceph_messenger *msgr) +void ceph_encode_my_addr(struct ceph_messenger *msgr) { memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); ceph_encode_banner_addr(&msgr->my_enc_addr); @@ -370,7 +370,7 @@ static void ceph_sock_write_space(struct sock *sk) * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). */ - if (con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { + if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); @@ -396,7 +396,7 @@ static void ceph_sock_state_change(struct sock *sk) case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); - con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); + ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: @@ -430,13 +430,15 @@ static void set_sock_callbacks(struct socket *sock, /* * initiate connection to a remote socket. */ -static int ceph_tcp_connect(struct ceph_connection *con) +int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */ struct socket *sock; unsigned int noio_flag; int ret; + dout("%s con %p peer_addr %s\n", __func__, con, + ceph_pr_addr(&con->peer_addr)); BUG_ON(con->sock); /* sock_create_kern() allocates with GFP_KERNEL */ @@ -454,8 +456,6 @@ static int ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); - dout("connect %s\n", ceph_pr_addr(&con->peer_addr)); - con_sock_state_connecting(con); ret = sock->ops->connect(sock, (struct sockaddr *)&ss, sizeof(ss), O_NONBLOCK); @@ -570,11 +570,11 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, /* * Shutdown/close the socket for the given connection. */ -static int con_close_socket(struct ceph_connection *con) +int ceph_con_close_socket(struct ceph_connection *con) { int rc = 0; - dout("con_close_socket on %p sock %p\n", con, con->sock); + dout("%s con %p sock %p\n", __func__, con, con->sock); if (con->sock) { rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); @@ -587,7 +587,7 @@ static int con_close_socket(struct ceph_connection *con) * received a socket close event before we had the chance to * shut the socket down. 
*/ - con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); + ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED); con_sock_state_closed(con); return rc; @@ -597,7 +597,7 @@ static void ceph_con_reset_protocol(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); - con_close_socket(con); + ceph_con_close_socket(con); if (con->in_msg) { WARN_ON(con->in_msg->con != con); ceph_msg_put(con->in_msg); @@ -631,7 +631,7 @@ static void ceph_msg_remove_list(struct list_head *head) } } -static void ceph_con_reset_session(struct ceph_connection *con) +void ceph_con_reset_session(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); @@ -656,10 +656,11 @@ void ceph_con_close(struct ceph_connection *con) dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr)); con->state = CEPH_CON_S_CLOSED; - con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next connect */ - con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); - con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); - con_flag_clear(con, CEPH_CON_F_BACKOFF); + ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next + connect */ + ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF); ceph_con_reset_protocol(con); ceph_con_reset_session(con); @@ -728,7 +729,7 @@ EXPORT_SYMBOL(ceph_con_init); * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. */ -static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) +u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt) { u32 ret; @@ -743,7 +744,7 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) /* * Discard messages that have been acked by the server. */ -static void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) +void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) { struct ceph_msg *msg; u64 seq; @@ -768,8 +769,7 @@ static void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq) * reconnect_seq. This avoids gratuitously resending messages that * the server had received and handled prior to reconnect. */ -static void ceph_con_discard_requeued(struct ceph_connection *con, - u64 reconnect_seq) +void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) { struct ceph_msg *msg; u64 seq; @@ -1150,8 +1150,8 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) cursor->need_crc = true; } -static void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, - struct ceph_msg *msg, size_t length) +void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, + struct ceph_msg *msg, size_t length) { BUG_ON(!length); BUG_ON(length > msg->data_length); @@ -1168,9 +1168,9 @@ static void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, * data item, and supply the page offset and length of that piece. * Indicate whether this is the last piece in this data item. */ -static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, - size_t *page_offset, size_t *length, - bool *last_piece) +struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece) { struct page *page; @@ -1209,8 +1209,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, * Returns true if the result moves the cursor on to the next piece * of the data item. 
*/ -static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, - size_t bytes) +void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { bool new_piece; @@ -1284,8 +1283,6 @@ static void prepare_write_message_footer(struct ceph_connection *con) con->out_msg_done = true; } -static void ceph_con_get_out_msg(struct ceph_connection *con); - /* * Prepare headers for the next outgoing message. */ @@ -1355,7 +1352,7 @@ static void prepare_write_message(struct ceph_connection *con) prepare_write_message_footer(con); } - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1376,7 +1373,7 @@ static void prepare_write_ack(struct ceph_connection *con) &con->out_temp_ack); con->out_more = 1; /* more will follow.. eventually.. */ - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1394,7 +1391,7 @@ static void prepare_write_seq(struct ceph_connection *con) con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1415,7 +1412,7 @@ static void prepare_write_keepalive(struct ceph_connection *con) } else { con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); } - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } /* @@ -1454,7 +1451,7 @@ static void prepare_write_banner(struct ceph_connection *con) &con->msgr->my_enc_addr); con->out_more = 0; - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } static void __prepare_write_connect(struct ceph_connection *con) @@ -1465,12 +1462,12 @@ static void __prepare_write_connect(struct ceph_connection *con) con->auth->authorizer_buf); con->out_more = 0; - con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } static int prepare_write_connect(struct ceph_connection *con) { - unsigned int global_seq = get_global_seq(con->msgr, 0); + unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); int proto; int ret; @@ -1549,9 +1546,8 @@ out: return ret; /* done! 
*/ } -static u32 ceph_crc32c_page(u32 crc, struct page *page, - unsigned int page_offset, - unsigned int length) +u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, + unsigned int length) { char *kaddr; @@ -1813,7 +1809,7 @@ static int verify_hello(struct ceph_connection *con) return 0; } -static bool addr_is_blank(struct ceph_entity_addr *addr) +bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) { struct sockaddr_storage ss = addr->in_addr; /* align */ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr; @@ -1829,7 +1825,7 @@ static bool addr_is_blank(struct ceph_entity_addr *addr) } } -static int addr_port(struct ceph_entity_addr *addr) +int ceph_addr_port(const struct ceph_entity_addr *addr) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1840,7 +1836,7 @@ static int addr_port(struct ceph_entity_addr *addr) return 0; } -static void addr_set_port(struct ceph_entity_addr *addr, int p) +void ceph_addr_set_port(struct ceph_entity_addr *addr, int p) { switch (get_unaligned(&addr->in_addr.ss_family)) { case AF_INET: @@ -1998,7 +1994,7 @@ int ceph_parse_ips(const char *c, const char *end, port = CEPH_MON_PORT; } - addr_set_port(&addr[i], port); + ceph_addr_set_port(&addr[i], port); addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); @@ -2037,7 +2033,7 @@ static int process_banner(struct ceph_connection *con) */ if (memcmp(&con->peer_addr, &con->actual_peer_addr, sizeof(con->peer_addr)) != 0 && - !(addr_is_blank(&con->actual_peer_addr) && + !(ceph_addr_is_blank(&con->actual_peer_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { pr_warn("wrong peer, want %s/%u, got %s/%u\n", ceph_pr_addr(&con->peer_addr), @@ -2051,12 +2047,12 @@ static int process_banner(struct ceph_connection *con) /* * did we learn our address? */ - if (addr_is_blank(my_addr)) { + if (ceph_addr_is_blank(my_addr)) { memcpy(&my_addr->in_addr, &con->peer_addr_for_me.in_addr, sizeof(con->peer_addr_for_me.in_addr)); - addr_set_port(my_addr, 0); - encode_my_addr(con->msgr); + ceph_addr_set_port(my_addr, 0); + ceph_encode_my_addr(con->msgr); dout("process_banner learned my addr is %s\n", ceph_pr_addr(my_addr)); } @@ -2192,8 +2188,8 @@ static int process_connect(struct ceph_connection *con) dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", con->peer_global_seq, le32_to_cpu(con->in_reply.global_seq)); - get_global_seq(con->msgr, - le32_to_cpu(con->in_reply.global_seq)); + ceph_get_global_seq(con->msgr, + le32_to_cpu(con->in_reply.global_seq)); con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) @@ -2227,7 +2223,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.connect_seq)); if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - con_flag_set(con, CEPH_CON_F_LOSSYTX); + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); con->delay = 0; /* reset backoff memory */ @@ -2351,9 +2347,6 @@ static int read_partial_msg_data(struct ceph_connection *con) /* * read (part of) a message. */ -static int ceph_con_in_msg_alloc(struct ceph_connection *con, - struct ceph_msg_header *hdr, int *skip); - static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; @@ -2515,7 +2508,7 @@ static int read_partial_message(struct ceph_connection *con) * be careful not to do anything that waits on other incoming messages or it * may deadlock. 
*/ -static void process_message(struct ceph_connection *con) +void ceph_con_process_message(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; @@ -2628,7 +2621,7 @@ more: do_next: if (con->state == CEPH_CON_S_OPEN) { - if (con_flag_test_and_clear(con, + if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) { prepare_write_keepalive(con); goto more; @@ -2645,7 +2638,7 @@ do_next: } /* Nothing to do! */ - con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); dout("try_write nothing else to write.\n"); ret = 0; out: @@ -2739,7 +2732,7 @@ more: prepare_read_keepalive_ack(con); break; case CEPH_MSGR_TAG_CLOSE: - con_close_socket(con); + ceph_con_close_socket(con); con->state = CEPH_CON_S_CLOSED; goto out; default: @@ -2764,7 +2757,7 @@ more: } if (con->in_tag == CEPH_MSGR_TAG_READY) goto more; - process_message(con); + ceph_con_process_message(con); if (con->state == CEPH_CON_S_OPEN) prepare_read_tag(con); goto more; @@ -2840,7 +2833,7 @@ static void cancel_con(struct ceph_connection *con) static bool con_sock_closed(struct ceph_connection *con) { - if (!con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED)) return false; #define CASE(x) \ @@ -2867,7 +2860,7 @@ static bool con_backoff(struct ceph_connection *con) { int ret; - if (!con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) + if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF)) return false; ret = queue_con_delay(con, con->delay); @@ -2875,7 +2868,7 @@ static bool con_backoff(struct ceph_connection *con) dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); - con_flag_set(con, CEPH_CON_F_BACKOFF); + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); } return true; @@ -2987,7 +2980,7 @@ static void con_fault(struct ceph_connection *con) ceph_con_reset_protocol(con); - if (con_flag_test(con, CEPH_CON_F_LOSSYTX)) { + if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); con->state = CEPH_CON_S_CLOSED; return; @@ -2999,9 +2992,9 @@ static void con_fault(struct ceph_connection *con) /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && - !con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { + !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); - con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); con->state = CEPH_CON_S_STANDBY; } else { /* retry after a delay. 
*/ @@ -3013,7 +3006,7 @@ static void con_fault(struct ceph_connection *con) if (con->delay > MAX_DELAY_INTERVAL) con->delay = MAX_DELAY_INTERVAL; } - con_flag_set(con, CEPH_CON_F_BACKOFF); + ceph_con_flag_set(con, CEPH_CON_F_BACKOFF); queue_con(con); } } @@ -3023,7 +3016,7 @@ void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) { u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; msgr->inst.addr.nonce = cpu_to_le32(nonce); - encode_my_addr(msgr); + ceph_encode_my_addr(msgr); } /* @@ -3037,7 +3030,7 @@ void ceph_messenger_init(struct ceph_messenger *msgr, if (myaddr) { memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr, sizeof(msgr->inst.addr.in_addr)); - addr_set_port(&msgr->inst.addr, 0); + ceph_addr_set_port(&msgr->inst.addr, 0); } msgr->inst.addr.type = 0; @@ -3047,7 +3040,7 @@ void ceph_messenger_init(struct ceph_messenger *msgr, get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); } while (!msgr->inst.addr.nonce); - encode_my_addr(msgr); + ceph_encode_my_addr(msgr); atomic_set(&msgr->stopping, 0); write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); @@ -3076,8 +3069,8 @@ static void clear_standby(struct ceph_connection *con) dout("clear_standby %p and ++connect_seq\n", con); con->state = CEPH_CON_S_PREOPEN; con->connect_seq++; - WARN_ON(con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); - WARN_ON(con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); + WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } } @@ -3118,7 +3111,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) /* if there wasn't anything waiting to send before, queue * new work */ - if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); @@ -3214,10 +3207,10 @@ void ceph_con_keepalive(struct ceph_connection *con) dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); - con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); + ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); - if (con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING) == 0) + if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING)) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); @@ -3423,8 +3416,8 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) * On error (ENOMEM, EAGAIN, ...), * - con->in_msg == NULL */ -static int ceph_con_in_msg_alloc(struct ceph_connection *con, - struct ceph_msg_header *hdr, int *skip) +int ceph_con_in_msg_alloc(struct ceph_connection *con, + struct ceph_msg_header *hdr, int *skip) { int middle_len = le32_to_cpu(hdr->middle_len); struct ceph_msg *msg; @@ -3470,7 +3463,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, return ret; } -static void ceph_con_get_out_msg(struct ceph_connection *con) +void ceph_con_get_out_msg(struct ceph_connection *con) { struct ceph_msg *msg; -- cgit v1.2.3 From 566050e17e53db283d4e26b73b4b50556f97ce7b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 12 Nov 2020 12:55:39 +0100 Subject: libceph: separate msgr1 protocol implementation In preparation for msgr2, define internal messenger <-> protocol interface (as opposed to external messenger <-> client interface, which is struct ceph_connection_operations) consisting of try_read(), try_write(), revoke(), revoke_incoming(), opened(), reset_session() and reset_protocol() ops. 
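Put differently, the messenger core keeps the state machine and socket plumbing, while everything wire-format specific now sits behind seven entry points. The patch wires them up as direct ceph_con_v1_*() calls; rendered as an ops table purely for illustration (the struct and its use are hypothetical, only the function names and signatures come from the patch), the interface looks like this:

/* Hypothetical rendering of the internal messenger <-> protocol API. */
struct ceph_con_proto_ops {
	int  (*try_read)(struct ceph_connection *con);
	int  (*try_write)(struct ceph_connection *con);
	void (*revoke)(struct ceph_connection *con);
	void (*revoke_incoming)(struct ceph_connection *con);
	bool (*opened)(struct ceph_connection *con);
	void (*reset_session)(struct ceph_connection *con);
	void (*reset_protocol)(struct ceph_connection *con);
};

static const struct ceph_con_proto_ops v1_ops = {
	.try_read        = ceph_con_v1_try_read,
	.try_write       = ceph_con_v1_try_write,
	.revoke          = ceph_con_v1_revoke,
	.revoke_incoming = ceph_con_v1_revoke_incoming,
	.opened          = ceph_con_v1_opened,
	.reset_session   = ceph_con_v1_reset_session,
	.reset_protocol  = ceph_con_v1_reset_protocol,
};

Keeping the dispatch as direct calls avoids an indirect branch per operation; presumably a later msgr2 series can select between v1 and v2 entry points once the banner exchange has settled on a protocol version.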
The semantics are exactly the same as they are now. Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 8 +++ net/ceph/messenger.c | 138 ++++++++++++++++++++++++++--------------- 2 files changed, 96 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 93815f1a42b5..8cc8b08eb3dd 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -382,6 +382,14 @@ int ceph_con_in_msg_alloc(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); void ceph_con_get_out_msg(struct ceph_connection *con); +int ceph_con_v1_try_read(struct ceph_connection *con); +int ceph_con_v1_try_write(struct ceph_connection *con); +void ceph_con_v1_revoke(struct ceph_connection *con); +void ceph_con_v1_revoke_incoming(struct ceph_connection *con); +bool ceph_con_v1_opened(struct ceph_connection *con); +void ceph_con_v1_reset_session(struct ceph_connection *con); +void ceph_con_v1_reset_protocol(struct ceph_connection *con); + extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 85d20372f923..4ca7d9b594c7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -593,6 +593,11 @@ int ceph_con_close_socket(struct ceph_connection *con) return rc; } +void ceph_con_v1_reset_protocol(struct ceph_connection *con) +{ + con->out_skip = 0; +} + static void ceph_con_reset_protocol(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); @@ -609,7 +614,7 @@ static void ceph_con_reset_protocol(struct ceph_connection *con) con->out_msg = NULL; } - con->out_skip = 0; + ceph_con_v1_reset_protocol(con); } /* @@ -631,6 +636,12 @@ static void ceph_msg_remove_list(struct list_head *head) } } +void ceph_con_v1_reset_session(struct ceph_connection *con) +{ + con->connect_seq = 0; + con->peer_global_seq = 0; +} + void ceph_con_reset_session(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); @@ -643,8 +654,7 @@ void ceph_con_reset_session(struct ceph_connection *con) con->in_seq = 0; con->in_seq_acked = 0; - con->connect_seq = 0; - con->peer_global_seq = 0; + ceph_con_v1_reset_session(con); } /* @@ -692,12 +702,17 @@ void ceph_con_open(struct ceph_connection *con, } EXPORT_SYMBOL(ceph_con_open); +bool ceph_con_v1_opened(struct ceph_connection *con) +{ + return con->connect_seq; +} + /* * return true if this connection ever successfully opened */ bool ceph_con_opened(struct ceph_connection *con) { - return con->connect_seq > 0; + return ceph_con_v1_opened(con); } /* @@ -2552,7 +2567,7 @@ static int read_keepalive_ack(struct ceph_connection *con) * Write something to the socket. Called in a worker thread when the * socket appears to be writeable and we have something ready to send. */ -static int try_write(struct ceph_connection *con) +int ceph_con_v1_try_write(struct ceph_connection *con) { int ret = 1; @@ -2649,7 +2664,7 @@ out: /* * Read what we can from the socket. 
*/ -static int try_read(struct ceph_connection *con) +int ceph_con_v1_try_read(struct ceph_connection *con) { int ret = -1; @@ -2930,7 +2945,7 @@ static void ceph_con_workfn(struct work_struct *work) BUG_ON(con->sock); } - ret = try_read(con); + ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -2940,7 +2955,7 @@ static void ceph_con_workfn(struct work_struct *work) break; } - ret = try_write(con); + ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -3116,6 +3131,29 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) } EXPORT_SYMBOL(ceph_con_send); +void ceph_con_v1_revoke(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + WARN_ON(con->out_skip); + /* footer */ + if (con->out_msg_done) { + con->out_skip += con_out_kvec_skip(con); + } else { + WARN_ON(!msg->data_length); + con->out_skip += sizeof_footer(con); + } + /* data, middle, front */ + if (msg->data_length) + con->out_skip += msg->cursor.total_resid; + if (msg->middle) + con->out_skip += con_out_kvec_skip(con); + con->out_skip += con_out_kvec_skip(con); + + dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, + con->out_kvec_bytes, con->out_skip); +} + /* * Revoke a message that was previously queued for send */ @@ -3129,39 +3167,50 @@ void ceph_msg_revoke(struct ceph_msg *msg) } mutex_lock(&con->mutex); - if (!list_empty(&msg->list_head)) { - dout("%s %p msg %p - was on queue\n", __func__, con, msg); - list_del_init(&msg->list_head); - msg->hdr.seq = 0; - - ceph_msg_put(msg); + if (list_empty(&msg->list_head)) { + WARN_ON(con->out_msg == msg); + dout("%s con %p msg %p not linked\n", __func__, con, msg); + mutex_unlock(&con->mutex); + return; } - if (con->out_msg == msg) { - BUG_ON(con->out_skip); - /* footer */ - if (con->out_msg_done) { - con->out_skip += con_out_kvec_skip(con); - } else { - BUG_ON(!msg->data_length); - con->out_skip += sizeof_footer(con); - } - /* data, middle, front */ - if (msg->data_length) - con->out_skip += msg->cursor.total_resid; - if (msg->middle) - con->out_skip += con_out_kvec_skip(con); - con->out_skip += con_out_kvec_skip(con); - dout("%s %p msg %p - was sending, will write %d skip %d\n", - __func__, con, msg, con->out_kvec_bytes, con->out_skip); - msg->hdr.seq = 0; + dout("%s con %p msg %p was linked\n", __func__, con, msg); + msg->hdr.seq = 0; + ceph_msg_remove(msg); + + if (con->out_msg == msg) { + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was sending\n", __func__, con, msg); + ceph_con_v1_revoke(con); + ceph_msg_put(con->out_msg); con->out_msg = NULL; - ceph_msg_put(msg); + } else { + dout("%s con %p msg %p not current, out_msg %p\n", __func__, + con, msg, con->out_msg); } - mutex_unlock(&con->mutex); } +void ceph_con_v1_revoke_incoming(struct ceph_connection *con) +{ + unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); + unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); + unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); + + /* skip rest of message */ + con->in_base_pos = con->in_base_pos - + sizeof(struct ceph_msg_header) - + front_len - + middle_len - + data_len - + sizeof(struct ceph_msg_footer); + + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + + dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos); +} + /* * Revoke a message that we may be reading data into */ @@ -3176,25 +3225,14 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) mutex_lock(&con->mutex); if (con->in_msg == msg) { - unsigned 
int front_len = le32_to_cpu(con->in_hdr.front_len); - unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); - unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); - - /* skip rest of message */ - dout("%s %p msg %p revoked\n", __func__, con, msg); - con->in_base_pos = con->in_base_pos - - sizeof(struct ceph_msg_header) - - front_len - - middle_len - - data_len - - sizeof(struct ceph_msg_footer); + WARN_ON(con->state != CEPH_CON_S_OPEN); + dout("%s con %p msg %p was recving\n", __func__, con, msg); + ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; } else { - dout("%s %p in_msg %p msg %p no-op\n", - __func__, con, con->in_msg, msg); + dout("%s con %p msg %p not current, in_msg %p\n", __func__, + con, msg, con->in_msg); } mutex_unlock(&con->mutex); } -- cgit v1.2.3 From 2f713615ddd9d805b6c5e79c52e0e11af99d2bf1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 12 Nov 2020 15:48:06 +0100 Subject: libceph: move msgr1 protocol implementation to its own file A pure move, no other changes. Note that ceph_tcp_recv{msg,page}() and ceph_tcp_send{msg,page}() helpers are also moved. msgr2 will bring its own, more efficient, variants based on iov_iter. Switching msgr1 to them was considered but decided against to avoid subtle regressions. Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 1 + net/ceph/Makefile | 3 +- net/ceph/messenger.c | 1495 --------------------------------------- net/ceph/messenger_v1.c | 1502 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1505 insertions(+), 1496 deletions(-) create mode 100644 net/ceph/messenger_v1.c (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 8cc8b08eb3dd..b5268127c55e 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -382,6 +382,7 @@ int ceph_con_in_msg_alloc(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); void ceph_con_get_out_msg(struct ceph_connection *con); +/* messenger_v1.c */ int ceph_con_v1_try_read(struct ceph_connection *con); int ceph_con_v1_try_write(struct ceph_connection *con); void ceph_con_v1_revoke(struct ceph_connection *con); diff --git a/net/ceph/Makefile b/net/ceph/Makefile index ce09bb4fb249..df02bd8d6c7b 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -14,4 +14,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_strings.o ceph_hash.o \ - pagevec.o snapshot.o string_table.o + pagevec.o snapshot.o string_table.o \ + messenger_v1.o diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4ca7d9b594c7..544cfdbe52d6 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -137,12 +137,6 @@ bool ceph_con_flag_test_and_set(struct ceph_connection *con, static struct kmem_cache *ceph_msg_cache; -/* static tag bytes (protocol control messages) */ -static char tag_msg = CEPH_MSGR_TAG_MSG; -static char tag_ack = CEPH_MSGR_TAG_ACK; -static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; -static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; - #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; #endif @@ -477,96 +471,6 @@ int ceph_tcp_connect(struct ceph_connection *con) return 0; } -/* - * If @buf is NULL, discard up to @len bytes. 
- */ -static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) -{ - struct kvec iov = {buf, len}; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (!buf) - msg.msg_flags |= MSG_TRUNC; - - iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -static int ceph_tcp_recvpage(struct socket *sock, struct page *page, - int page_offset, size_t length) -{ - struct bio_vec bvec = { - .bv_page = page, - .bv_offset = page_offset, - .bv_len = length - }; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - BUG_ON(page_offset + length > PAGE_SIZE); - iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); - r = sock_recvmsg(sock, &msg, msg.msg_flags); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * write something. @more is true if caller will be sending more data - * shortly. - */ -static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, - size_t kvlen, size_t len, bool more) -{ - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; - int r; - - if (more) - msg.msg_flags |= MSG_MORE; - else - msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ - - r = kernel_sendmsg(sock, &msg, iov, kvlen, len); - if (r == -EAGAIN) - r = 0; - return r; -} - -/* - * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST - */ -static int ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int more) -{ - ssize_t (*sendpage)(struct socket *sock, struct page *page, - int offset, size_t size, int flags); - int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; - int ret; - - /* - * sendpage cannot properly handle pages with page_count == 0, - * we need to fall back to sendmsg if that's the case. - * - * Same goes for slab pages: skb_can_coalesce() allows - * coalescing neighboring slab objects into a single frag which - * triggers one of hardened usercopy checks. - */ - if (sendpage_ok(page)) - sendpage = sock->ops->sendpage; - else - sendpage = sock_no_sendpage; - - ret = sendpage(sock, page, offset, size, flags); - if (ret == -EAGAIN) - ret = 0; - - return ret; -} - /* * Shutdown/close the socket for the given connection. */ @@ -593,11 +497,6 @@ int ceph_con_close_socket(struct ceph_connection *con) return rc; } -void ceph_con_v1_reset_protocol(struct ceph_connection *con) -{ - con->out_skip = 0; -} - static void ceph_con_reset_protocol(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); @@ -636,12 +535,6 @@ static void ceph_msg_remove_list(struct list_head *head) } } -void ceph_con_v1_reset_session(struct ceph_connection *con) -{ - con->connect_seq = 0; - con->peer_global_seq = 0; -} - void ceph_con_reset_session(struct ceph_connection *con) { dout("%s con %p\n", __func__, con); @@ -702,11 +595,6 @@ void ceph_con_open(struct ceph_connection *con, } EXPORT_SYMBOL(ceph_con_open); -bool ceph_con_v1_opened(struct ceph_connection *con) -{ - return con->connect_seq; -} - /* * return true if this connection ever successfully opened */ @@ -739,7 +627,6 @@ void ceph_con_init(struct ceph_connection *con, void *private, } EXPORT_SYMBOL(ceph_con_init); - /* * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. 
@@ -805,50 +692,6 @@ void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq) } } -static void con_out_kvec_reset(struct ceph_connection *con) -{ - BUG_ON(con->out_skip); - - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - con->out_kvec_cur = &con->out_kvec[0]; -} - -static void con_out_kvec_add(struct ceph_connection *con, - size_t size, void *data) -{ - int index = con->out_kvec_left; - - BUG_ON(con->out_skip); - BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); - - con->out_kvec[index].iov_len = size; - con->out_kvec[index].iov_base = data; - con->out_kvec_left++; - con->out_kvec_bytes += size; -} - -/* - * Chop off a kvec from the end. Return residual number of bytes for - * that kvec, i.e. how many bytes would have been written if the kvec - * hadn't been nuked. - */ -static int con_out_kvec_skip(struct ceph_connection *con) -{ - int off = con->out_kvec_cur - con->out_kvec; - int skip = 0; - - if (con->out_kvec_bytes > 0) { - skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; - BUG_ON(con->out_kvec_bytes < skip); - BUG_ON(!con->out_kvec_left); - con->out_kvec_bytes -= skip; - con->out_kvec_left--; - } - - return skip; -} - #ifdef CONFIG_BLOCK /* @@ -1260,307 +1103,6 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) cursor->need_crc = new_piece; } -static size_t sizeof_footer(struct ceph_connection *con) -{ - return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? - sizeof(struct ceph_msg_footer) : - sizeof(struct ceph_msg_footer_old); -} - -static void prepare_message_data(struct ceph_msg *msg, u32 data_len) -{ - /* Initialize data cursor */ - - ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); -} - -/* - * Prepare footer for currently outgoing message, and finish things - * off. Assumes out_kvec* are already valid.. we just add on to the end. - */ -static void prepare_write_message_footer(struct ceph_connection *con) -{ - struct ceph_msg *m = con->out_msg; - - m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; - - dout("prepare_write_message_footer %p\n", con); - con_out_kvec_add(con, sizeof_footer(con), &m->footer); - if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { - if (con->ops->sign_message) - con->ops->sign_message(m); - else - m->footer.sig = 0; - } else { - m->old_footer.flags = m->footer.flags; - } - con->out_more = m->more_to_follow; - con->out_msg_done = true; -} - -/* - * Prepare headers for the next outgoing message. - */ -static void prepare_write_message(struct ceph_connection *con) -{ - struct ceph_msg *m; - u32 crc; - - con_out_kvec_reset(con); - con->out_msg_done = false; - - /* Sneak an ack in there first? If we can get it into the same - * TCP packet that's a good thing. 
*/ - if (con->in_seq > con->in_seq_acked) { - con->in_seq_acked = con->in_seq; - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - } - - ceph_con_get_out_msg(con); - m = con->out_msg; - - dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", - m, con->out_seq, le16_to_cpu(m->hdr.type), - le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - m->data_length); - WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); - WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); - - /* tag + hdr + front + middle */ - con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); - con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); - - if (m->middle) - con_out_kvec_add(con, m->middle->vec.iov_len, - m->middle->vec.iov_base); - - /* fill in hdr crc and finalize hdr */ - crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); - con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); - - /* fill in front and middle crc, footer */ - crc = crc32c(0, m->front.iov_base, m->front.iov_len); - con->out_msg->footer.front_crc = cpu_to_le32(crc); - if (m->middle) { - crc = crc32c(0, m->middle->vec.iov_base, - m->middle->vec.iov_len); - con->out_msg->footer.middle_crc = cpu_to_le32(crc); - } else - con->out_msg->footer.middle_crc = 0; - dout("%s front_crc %u middle_crc %u\n", __func__, - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - con->out_msg->footer.flags = 0; - - /* is there a data payload? */ - con->out_msg->footer.data_crc = 0; - if (m->data_length) { - prepare_message_data(con->out_msg, m->data_length); - con->out_more = 1; /* data + footer will follow */ - } else { - /* no, queue up footer too and be done */ - prepare_write_message_footer(con); - } - - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -/* - * Prepare an ack. - */ -static void prepare_write_ack(struct ceph_connection *con) -{ - dout("prepare_write_ack %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - con->out_more = 1; /* more will follow.. eventually.. */ - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -/* - * Prepare to share the seq during handshake - */ -static void prepare_write_seq(struct ceph_connection *con) -{ - dout("prepare_write_seq %p %llu -> %llu\n", con, - con->in_seq_acked, con->in_seq); - con->in_seq_acked = con->in_seq; - - con_out_kvec_reset(con); - - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); - - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -/* - * Prepare to write keepalive byte. 
- */ -static void prepare_write_keepalive(struct ceph_connection *con) -{ - dout("prepare_write_keepalive %p\n", con); - con_out_kvec_reset(con); - if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { - struct timespec64 now; - - ktime_get_real_ts64(&now); - con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); - ceph_encode_timespec64(&con->out_temp_keepalive2, &now); - con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), - &con->out_temp_keepalive2); - } else { - con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); - } - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -/* - * Connection negotiation. - */ - -static int get_connect_authorizer(struct ceph_connection *con) -{ - struct ceph_auth_handshake *auth; - int auth_proto; - - if (!con->ops->get_authorizer) { - con->auth = NULL; - con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; - con->out_connect.authorizer_len = 0; - return 0; - } - - auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); - if (IS_ERR(auth)) - return PTR_ERR(auth); - - con->auth = auth; - con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); - con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); - return 0; -} - -/* - * We connected to a peer and are saying hello. - */ -static void prepare_write_banner(struct ceph_connection *con) -{ - con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); - con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), - &con->msgr->my_enc_addr); - - con->out_more = 0; - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -static void __prepare_write_connect(struct ceph_connection *con) -{ - con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); - if (con->auth) - con_out_kvec_add(con, con->auth->authorizer_buf_len, - con->auth->authorizer_buf); - - con->out_more = 0; - ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); -} - -static int prepare_write_connect(struct ceph_connection *con) -{ - unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); - int proto; - int ret; - - switch (con->peer_name.type) { - case CEPH_ENTITY_TYPE_MON: - proto = CEPH_MONC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_OSD: - proto = CEPH_OSDC_PROTOCOL; - break; - case CEPH_ENTITY_TYPE_MDS: - proto = CEPH_MDSC_PROTOCOL; - break; - default: - BUG(); - } - - dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, - con->connect_seq, global_seq, proto); - - con->out_connect.features = - cpu_to_le64(from_msgr(con->msgr)->supported_features); - con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); - con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); - con->out_connect.global_seq = cpu_to_le32(global_seq); - con->out_connect.protocol_version = cpu_to_le32(proto); - con->out_connect.flags = 0; - - ret = get_connect_authorizer(con); - if (ret) - return ret; - - __prepare_write_connect(con); - return 0; -} - -/* - * write as much of pending kvecs to the socket as we can. 
- * 1 -> done - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_kvec(struct ceph_connection *con) -{ - int ret; - - dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); - while (con->out_kvec_bytes > 0) { - ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, - con->out_kvec_left, con->out_kvec_bytes, - con->out_more); - if (ret <= 0) - goto out; - con->out_kvec_bytes -= ret; - if (con->out_kvec_bytes == 0) - break; /* done */ - - /* account for full iov entries consumed */ - while (ret >= con->out_kvec_cur->iov_len) { - BUG_ON(!con->out_kvec_left); - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; - } - /* and for a partially-consumed entry */ - if (ret) { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; - } - } - con->out_kvec_left = 0; - ret = 1; -out: - dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, - con->out_kvec_bytes, con->out_kvec_left, ret); - return ret; /* done! */ -} - u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, unsigned int length) { @@ -1573,256 +1115,6 @@ u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, return crc; } -/* - * Write as much message data payload as we can. If we finish, queue - * up the footer. - * 1 -> done, footer is now queued in out_kvec[]. - * 0 -> socket full, but more to do - * <0 -> error - */ -static int write_partial_message_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->out_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - u32 crc; - - dout("%s %p msg %p\n", __func__, con, msg); - - if (!msg->num_data_items) - return -EINVAL; - - /* - * Iterate through each page that contains data to be - * written, and send as much as possible for each. - * - * If we are calculating the data crc (the default), we will - * need to map the page. If we have no pages, they have - * been revoked, so use the zero page. - */ - crc = do_datacrc ? 
le32_to_cpu(msg->footer.data_crc) : 0; - while (cursor->total_resid) { - struct page *page; - size_t page_offset; - size_t length; - int ret; - - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - if (length == cursor->total_resid) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, - more); - if (ret <= 0) { - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - - return ret; - } - if (do_datacrc && cursor->need_crc) - crc = ceph_crc32c_page(crc, page, page_offset, length); - ceph_msg_data_advance(cursor, (size_t)ret); - } - - dout("%s %p msg %p done\n", __func__, con, msg); - - /* prepare and queue up footer, too */ - if (do_datacrc) - msg->footer.data_crc = cpu_to_le32(crc); - else - msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; - con_out_kvec_reset(con); - prepare_write_message_footer(con); - - return 1; /* must return > 0 to indicate success */ -} - -/* - * write some zeros - */ -static int write_partial_skip(struct ceph_connection *con) -{ - int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; - int ret; - - dout("%s %p %d left\n", __func__, con, con->out_skip); - while (con->out_skip > 0) { - size_t size = min(con->out_skip, (int) PAGE_SIZE); - - if (size == con->out_skip) - more = MSG_MORE; - ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, - more); - if (ret <= 0) - goto out; - con->out_skip -= ret; - } - ret = 1; -out: - return ret; -} - -/* - * Prepare to read connection handshake, or an ack. - */ -static void prepare_read_banner(struct ceph_connection *con) -{ - dout("prepare_read_banner %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_connect(struct ceph_connection *con) -{ - dout("prepare_read_connect %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_ack(struct ceph_connection *con) -{ - dout("prepare_read_ack %p\n", con); - con->in_base_pos = 0; -} - -static void prepare_read_seq(struct ceph_connection *con) -{ - dout("prepare_read_seq %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_SEQ; -} - -static void prepare_read_tag(struct ceph_connection *con) -{ - dout("prepare_read_tag %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_READY; -} - -static void prepare_read_keepalive_ack(struct ceph_connection *con) -{ - dout("prepare_read_keepalive_ack %p\n", con); - con->in_base_pos = 0; -} - -/* - * Prepare to read a message. 
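
The skip loop in write_partial_skip() above has a compact userspace analogue that isolates the short-write handling. This is only a sketch: the demo_* names are invented, a plain non-blocking write() stands in for ceph_tcp_sendpage(), and a static buffer stands in for the kernel's shared zero page.

#include <errno.h>
#include <sys/types.h>
#include <unistd.h>

#define DEMO_PAGE_SIZE 4096
static const char demo_zero_page[DEMO_PAGE_SIZE];  /* all zeros */

/* 1 -> done, 0 -> socket full (try again later), -1 -> error */
static int demo_write_skip(int fd, int *skip)
{
        while (*skip > 0) {
                size_t size = *skip < DEMO_PAGE_SIZE ?
                              (size_t)*skip : DEMO_PAGE_SIZE;
                ssize_t ret = write(fd, demo_zero_page, size);

                if (ret < 0)
                        return (errno == EAGAIN) ? 0 : -1;
                *skip -= ret;
        }
        return 1;
}
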
- */ -static int prepare_read_message(struct ceph_connection *con) -{ - dout("prepare_read_message %p\n", con); - BUG_ON(con->in_msg != NULL); - con->in_base_pos = 0; - con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; - return 0; -} - - -static int read_partial(struct ceph_connection *con, - int end, int size, void *object) -{ - while (con->in_base_pos < end) { - int left = end - con->in_base_pos; - int have = size - left; - int ret = ceph_tcp_recvmsg(con->sock, object + have, left); - if (ret <= 0) - return ret; - con->in_base_pos += ret; - } - return 1; -} - - -/* - * Read all or part of the connect-side handshake on a new connection - */ -static int read_partial_banner(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_banner %p at %d\n", con, con->in_base_pos); - - /* peer's banner */ - size = strlen(CEPH_BANNER); - end = size; - ret = read_partial(con, end, size, con->in_banner); - if (ret <= 0) - goto out; - - size = sizeof (con->actual_peer_addr); - end += size; - ret = read_partial(con, end, size, &con->actual_peer_addr); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->actual_peer_addr); - - size = sizeof (con->peer_addr_for_me); - end += size; - ret = read_partial(con, end, size, &con->peer_addr_for_me); - if (ret <= 0) - goto out; - ceph_decode_banner_addr(&con->peer_addr_for_me); - -out: - return ret; -} - -static int read_partial_connect(struct ceph_connection *con) -{ - int size; - int end; - int ret; - - dout("read_partial_connect %p at %d\n", con, con->in_base_pos); - - size = sizeof (con->in_reply); - end = size; - ret = read_partial(con, end, size, &con->in_reply); - if (ret <= 0) - goto out; - - if (con->auth) { - size = le32_to_cpu(con->in_reply.authorizer_len); - if (size > con->auth->authorizer_reply_buf_len) { - pr_err("authorizer reply too big: %d > %zu\n", size, - con->auth->authorizer_reply_buf_len); - ret = -EINVAL; - goto out; - } - - end += size; - ret = read_partial(con, end, size, - con->auth->authorizer_reply_buf); - if (ret <= 0) - goto out; - } - - dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", - con, (int)con->in_reply.tag, - le32_to_cpu(con->in_reply.connect_seq), - le32_to_cpu(con->in_reply.global_seq)); -out: - return ret; -} - -/* - * Verify the hello banner looks okay. - */ -static int verify_hello(struct ceph_connection *con) -{ - if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { - pr_err("connect to %s got bad banner\n", - ceph_pr_addr(&con->peer_addr)); - con->error_msg = "protocol error, bad banner"; - return -1; - } - return 0; -} bool ceph_addr_is_blank(const struct ceph_entity_addr *addr) { @@ -2032,492 +1324,6 @@ bad: return ret; } -static int process_banner(struct ceph_connection *con) -{ - struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; - - dout("process_banner on %p\n", con); - - if (verify_hello(con) < 0) - return -1; - - /* - * Make sure the other end is who we wanted. note that the other - * end may not yet know their ip address, so if it's 0.0.0.0, give - * them the benefit of the doubt. 
- */ - if (memcmp(&con->peer_addr, &con->actual_peer_addr, - sizeof(con->peer_addr)) != 0 && - !(ceph_addr_is_blank(&con->actual_peer_addr) && - con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_warn("wrong peer, want %s/%u, got %s/%u\n", - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->peer_addr.nonce), - ceph_pr_addr(&con->actual_peer_addr), - le32_to_cpu(con->actual_peer_addr.nonce)); - con->error_msg = "wrong peer at address"; - return -1; - } - - /* - * did we learn our address? - */ - if (ceph_addr_is_blank(my_addr)) { - memcpy(&my_addr->in_addr, - &con->peer_addr_for_me.in_addr, - sizeof(con->peer_addr_for_me.in_addr)); - ceph_addr_set_port(my_addr, 0); - ceph_encode_my_addr(con->msgr); - dout("process_banner learned my addr is %s\n", - ceph_pr_addr(my_addr)); - } - - return 0; -} - -static int process_connect(struct ceph_connection *con) -{ - u64 sup_feat = from_msgr(con->msgr)->supported_features; - u64 req_feat = from_msgr(con->msgr)->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); - int ret; - - dout("process_connect on %p tag %d\n", con, (int)con->in_tag); - - if (con->auth) { - int len = le32_to_cpu(con->in_reply.authorizer_len); - - /* - * Any connection that defines ->get_authorizer() - * should also define ->add_authorizer_challenge() and - * ->verify_authorizer_reply(). - * - * See get_connect_authorizer(). - */ - if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { - ret = con->ops->add_authorizer_challenge( - con, con->auth->authorizer_reply_buf, len); - if (ret < 0) - return ret; - - con_out_kvec_reset(con); - __prepare_write_connect(con); - prepare_read_connect(con); - return 0; - } - - if (len) { - ret = con->ops->verify_authorizer_reply(con); - if (ret < 0) { - con->error_msg = "bad authorize reply"; - return ret; - } - } - } - - switch (con->in_reply.tag) { - case CEPH_MSGR_TAG_FEATURES: - pr_err("%s%lld %s feature set mismatch," - " my %llx < server's %llx, missing %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - sup_feat, server_feat, server_feat & ~sup_feat); - con->error_msg = "missing required protocol features"; - return -1; - - case CEPH_MSGR_TAG_BADPROTOVER: - pr_err("%s%lld %s protocol version mismatch," - " my %d != server's %d\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->out_connect.protocol_version), - le32_to_cpu(con->in_reply.protocol_version)); - con->error_msg = "protocol version mismatch"; - return -1; - - case CEPH_MSGR_TAG_BADAUTHORIZER: - con->auth_retry++; - dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, - con->auth_retry); - if (con->auth_retry == 2) { - con->error_msg = "connect authorization failure"; - return -1; - } - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RESETSESSION: - /* - * If we connected with a large connect_seq but the peer - * has no record of a session with us (no connection, or - * connect_seq == 0), they will send RESETSESION to indicate - * that they must have reset their session, and may have - * dropped messages. 
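
The RESETSESSION, RETRY_SESSION and RETRY_GLOBAL branches of process_connect() are easier to compare side by side. The following is a schematic only, with invented demo_* names; the real handler also resets the output kvecs, re-queues the connect message, and (for RESETSESSION) drops the mutex to run the peer_reset callback.

enum demo_tag { DEMO_RESETSESSION, DEMO_RETRY_SESSION, DEMO_RETRY_GLOBAL };

struct demo_conn {
        unsigned int connect_seq;  /* per-session attempt counter */
        unsigned int global_seq;   /* process-wide connection counter */
};

static void demo_negotiate(struct demo_conn *c, enum demo_tag tag,
                           unsigned int peer_cseq, unsigned int peer_gseq)
{
        switch (tag) {
        case DEMO_RESETSESSION:
                /* peer lost our session: start over from seq 0 */
                c->connect_seq = 0;
                break;
        case DEMO_RETRY_SESSION:
                /* our connect_seq was too small: adopt the peer's */
                c->connect_seq = peer_cseq;
                break;
        case DEMO_RETRY_GLOBAL:
                /* our global_seq was too small: move past the peer's */
                if (c->global_seq <= peer_gseq)
                        c->global_seq = peer_gseq + 1;
                break;
        }
        /* in all three cases the connect message is then re-sent */
}
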
- */ - dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_reply.connect_seq)); - pr_info("%s%lld %s session reset\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr)); - ceph_con_reset_session(con); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - - /* Tell ceph about it. */ - mutex_unlock(&con->mutex); - if (con->ops->peer_reset) - con->ops->peer_reset(con); - mutex_lock(&con->mutex); - if (con->state != CEPH_CON_S_V1_CONNECT_MSG) - return -EAGAIN; - break; - - case CEPH_MSGR_TAG_RETRY_SESSION: - /* - * If we sent a smaller connect_seq than the peer has, try - * again with a larger value. - */ - dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", - le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_reply.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_RETRY_GLOBAL: - /* - * If we sent a smaller global_seq than the peer has, try - * again with a larger value. - */ - dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.global_seq)); - ceph_get_global_seq(con->msgr, - le32_to_cpu(con->in_reply.global_seq)); - con_out_kvec_reset(con); - ret = prepare_write_connect(con); - if (ret < 0) - return ret; - prepare_read_connect(con); - break; - - case CEPH_MSGR_TAG_SEQ: - case CEPH_MSGR_TAG_READY: - if (req_feat & ~server_feat) { - pr_err("%s%lld %s protocol feature mismatch," - " my required %llx > server's %llx, need %llx\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - req_feat, server_feat, req_feat & ~server_feat); - con->error_msg = "missing required protocol features"; - return -1; - } - - WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); - con->state = CEPH_CON_S_OPEN; - con->auth_retry = 0; /* we authenticated; clear flag */ - con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); - con->connect_seq++; - con->peer_features = server_feat; - dout("process_connect got READY gseq %d cseq %d (%d)\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.connect_seq), - con->connect_seq); - WARN_ON(con->connect_seq != - le32_to_cpu(con->in_reply.connect_seq)); - - if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) - ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); - - con->delay = 0; /* reset backoff memory */ - - if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { - prepare_write_seq(con); - prepare_read_seq(con); - } else { - prepare_read_tag(con); - } - break; - - case CEPH_MSGR_TAG_WAIT: - /* - * If there is a connection race (we are opening - * connections to each other), one of us may just have - * to WAIT. This shouldn't happen if we are the - * client. - */ - con->error_msg = "protocol error, got WAIT as client"; - return -1; - - default: - con->error_msg = "protocol error, garbage tag during connect"; - return -1; - } - return 0; -} - - -/* - * read (part of) an ack - */ -static int read_partial_ack(struct ceph_connection *con) -{ - int size = sizeof (con->in_temp_ack); - int end = size; - - return read_partial(con, end, size, &con->in_temp_ack); -} - -/* - * We can finally discard anything that's been acked. 
- */ -static void process_ack(struct ceph_connection *con) -{ - u64 ack = le64_to_cpu(con->in_temp_ack); - - if (con->in_tag == CEPH_MSGR_TAG_ACK) - ceph_con_discard_sent(con, ack); - else - ceph_con_discard_requeued(con, ack); - - prepare_read_tag(con); -} - - -static int read_partial_message_section(struct ceph_connection *con, - struct kvec *section, - unsigned int sec_len, u32 *crc) -{ - int ret, left; - - BUG_ON(!section); - - while (section->iov_len < sec_len) { - BUG_ON(section->iov_base == NULL); - left = sec_len - section->iov_len; - ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + - section->iov_len, left); - if (ret <= 0) - return ret; - section->iov_len += ret; - } - if (section->iov_len == sec_len) - *crc = crc32c(0, section->iov_base, section->iov_len); - - return 1; -} - -static int read_partial_msg_data(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->in_msg; - struct ceph_msg_data_cursor *cursor = &msg->cursor; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - struct page *page; - size_t page_offset; - size_t length; - u32 crc = 0; - int ret; - - if (!msg->num_data_items) - return -EIO; - - if (do_datacrc) - crc = con->in_data_crc; - while (cursor->total_resid) { - if (!cursor->resid) { - ceph_msg_data_advance(cursor, 0); - continue; - } - - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); - ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); - if (ret <= 0) { - if (do_datacrc) - con->in_data_crc = crc; - - return ret; - } - - if (do_datacrc) - crc = ceph_crc32c_page(crc, page, page_offset, ret); - ceph_msg_data_advance(cursor, (size_t)ret); - } - if (do_datacrc) - con->in_data_crc = crc; - - return 1; /* must return > 0 to indicate success */ -} - -/* - * read (part of) a message. - */ -static int read_partial_message(struct ceph_connection *con) -{ - struct ceph_msg *m = con->in_msg; - int size; - int end; - int ret; - unsigned int front_len, middle_len, data_len; - bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); - bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); - u64 seq; - u32 crc; - - dout("read_partial_message con %p msg %p\n", con, m); - - /* header */ - size = sizeof (con->in_hdr); - end = size; - ret = read_partial(con, end, size, &con->in_hdr); - if (ret <= 0) - return ret; - - crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); - if (cpu_to_le32(crc) != con->in_hdr.crc) { - pr_err("read_partial_message bad hdr crc %u != expected %u\n", - crc, con->in_hdr.crc); - return -EBADMSG; - } - - front_len = le32_to_cpu(con->in_hdr.front_len); - if (front_len > CEPH_MSG_MAX_FRONT_LEN) - return -EIO; - middle_len = le32_to_cpu(con->in_hdr.middle_len); - if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) - return -EIO; - data_len = le32_to_cpu(con->in_hdr.data_len); - if (data_len > CEPH_MSG_MAX_DATA_LEN) - return -EIO; - - /* verify seq# */ - seq = le64_to_cpu(con->in_hdr.seq); - if ((s64)seq - (s64)con->in_seq < 1) { - pr_info("skipping %s%lld %s seq %lld expected %lld\n", - ENTITY_NAME(con->peer_name), - ceph_pr_addr(&con->peer_addr), - seq, con->in_seq + 1); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - return 1; - } else if ((s64)seq - (s64)con->in_seq > 1) { - pr_err("read_partial_message bad seq %lld expected %lld\n", - seq, con->in_seq + 1); - con->error_msg = "bad message sequence # for incoming message"; - return -EBADE; - } - - /* allocate message? 
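
A detail worth noting in read_partial_message(): a message is skipped (stale seq, or the alloc callback says so) by setting con->in_base_pos to minus the number of remaining bytes, and the read loop later drains exactly that many bytes before returning to the tag state. A userspace sketch of the same trick, with invented demo_ names and a scratch buffer standing in for the kernel's NULL-buffer (MSG_TRUNC) receive:

#include <errno.h>
#include <sys/types.h>
#include <unistd.h>

/* arm the discard: remaining message length, negated */
static void demo_skip_message(int *pos, int front, int middle,
                              int data, int footer)
{
        *pos = -(front + middle + data + footer);
}

/* drain the skipped bytes; done when *pos climbs back to 0 */
static int demo_drain(int fd, int *pos)
{
        char buf[4096];

        while (*pos < 0) {
                int chunk = -*pos < (int)sizeof(buf) ?
                            -*pos : (int)sizeof(buf);
                ssize_t ret = read(fd, buf, chunk);

                if (ret < 0)
                        return (errno == EAGAIN) ? 0 : -1;
                if (ret == 0)
                        return 0;
                *pos += ret;
        }
        return 1;
}
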
*/ - if (!con->in_msg) { - int skip = 0; - - dout("got hdr type %d front %d data %d\n", con->in_hdr.type, - front_len, data_len); - ret = ceph_con_in_msg_alloc(con, &con->in_hdr, &skip); - if (ret < 0) - return ret; - - BUG_ON(!con->in_msg ^ skip); - if (skip) { - /* skip this message */ - dout("alloc_msg said skip message\n"); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; - return 1; - } - - BUG_ON(!con->in_msg); - BUG_ON(con->in_msg->con != con); - m = con->in_msg; - m->front.iov_len = 0; /* haven't read it yet */ - if (m->middle) - m->middle->vec.iov_len = 0; - - /* prepare for data payload, if any */ - - if (data_len) - prepare_message_data(con->in_msg, data_len); - } - - /* front */ - ret = read_partial_message_section(con, &m->front, front_len, - &con->in_front_crc); - if (ret <= 0) - return ret; - - /* middle */ - if (m->middle) { - ret = read_partial_message_section(con, &m->middle->vec, - middle_len, - &con->in_middle_crc); - if (ret <= 0) - return ret; - } - - /* (page) data */ - if (data_len) { - ret = read_partial_msg_data(con); - if (ret <= 0) - return ret; - } - - /* footer */ - size = sizeof_footer(con); - end += size; - ret = read_partial(con, end, size, &m->footer); - if (ret <= 0) - return ret; - - if (!need_sign) { - m->footer.flags = m->old_footer.flags; - m->footer.sig = 0; - } - - dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", - m, front_len, m->footer.front_crc, middle_len, - m->footer.middle_crc, data_len, m->footer.data_crc); - - /* crc ok? */ - if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { - pr_err("read_partial_message %p front crc %u != exp. %u\n", - m, con->in_front_crc, m->footer.front_crc); - return -EBADMSG; - } - if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { - pr_err("read_partial_message %p middle crc %u != exp %u\n", - m, con->in_middle_crc, m->footer.middle_crc); - return -EBADMSG; - } - if (do_datacrc && - (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && - con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { - pr_err("read_partial_message %p data crc %u != exp. %u\n", m, - con->in_data_crc, le32_to_cpu(m->footer.data_crc)); - return -EBADMSG; - } - - if (need_sign && con->ops->check_message_signature && - con->ops->check_message_signature(m)) { - pr_err("read_partial_message %p signature check failed\n", m); - return -EBADMSG; - } - - return 1; /* done! */ -} - /* * Process message. This happens in the worker thread. The callback should * be careful not to do anything that waits on other incoming messages or it @@ -2551,263 +1357,6 @@ void ceph_con_process_message(struct ceph_connection *con) mutex_lock(&con->mutex); } -static int read_keepalive_ack(struct ceph_connection *con) -{ - struct ceph_timespec ceph_ts; - size_t size = sizeof(ceph_ts); - int ret = read_partial(con, size, size, &ceph_ts); - if (ret <= 0) - return ret; - ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); - prepare_read_tag(con); - return 1; -} - -/* - * Write something to the socket. Called in a worker thread when the - * socket appears to be writeable and we have something ready to send. - */ -int ceph_con_v1_try_write(struct ceph_connection *con) -{ - int ret = 1; - - dout("try_write start %p state %d\n", con, con->state); - if (con->state != CEPH_CON_S_PREOPEN && - con->state != CEPH_CON_S_V1_BANNER && - con->state != CEPH_CON_S_V1_CONNECT_MSG && - con->state != CEPH_CON_S_OPEN) - return 0; - - /* open the socket first? 
*/ - if (con->state == CEPH_CON_S_PREOPEN) { - BUG_ON(con->sock); - con->state = CEPH_CON_S_V1_BANNER; - - con_out_kvec_reset(con); - prepare_write_banner(con); - prepare_read_banner(con); - - BUG_ON(con->in_msg); - con->in_tag = CEPH_MSGR_TAG_READY; - dout("try_write initiating connect on %p new state %d\n", - con, con->state); - ret = ceph_tcp_connect(con); - if (ret < 0) { - con->error_msg = "connect error"; - goto out; - } - } - -more: - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); - BUG_ON(!con->sock); - - /* kvec data queued? */ - if (con->out_kvec_left) { - ret = write_partial_kvec(con); - if (ret <= 0) - goto out; - } - if (con->out_skip) { - ret = write_partial_skip(con); - if (ret <= 0) - goto out; - } - - /* msg pages? */ - if (con->out_msg) { - if (con->out_msg_done) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; /* we're done with this one */ - goto do_next; - } - - ret = write_partial_message_data(con); - if (ret == 1) - goto more; /* we need to send the footer, too! */ - if (ret == 0) - goto out; - if (ret < 0) { - dout("try_write write_partial_message_data err %d\n", - ret); - goto out; - } - } - -do_next: - if (con->state == CEPH_CON_S_OPEN) { - if (ceph_con_flag_test_and_clear(con, - CEPH_CON_F_KEEPALIVE_PENDING)) { - prepare_write_keepalive(con); - goto more; - } - /* is anything else pending? */ - if (!list_empty(&con->out_queue)) { - prepare_write_message(con); - goto more; - } - if (con->in_seq > con->in_seq_acked) { - prepare_write_ack(con); - goto more; - } - } - - /* Nothing to do! */ - ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); - dout("try_write nothing else to write.\n"); - ret = 0; -out: - dout("try_write done on %p ret %d\n", con, ret); - return ret; -} - -/* - * Read what we can from the socket. - */ -int ceph_con_v1_try_read(struct ceph_connection *con) -{ - int ret = -1; - -more: - dout("try_read start %p state %d\n", con, con->state); - if (con->state != CEPH_CON_S_V1_BANNER && - con->state != CEPH_CON_S_V1_CONNECT_MSG && - con->state != CEPH_CON_S_OPEN) - return 0; - - BUG_ON(!con->sock); - - dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, - con->in_base_pos); - - if (con->state == CEPH_CON_S_V1_BANNER) { - ret = read_partial_banner(con); - if (ret <= 0) - goto out; - ret = process_banner(con); - if (ret < 0) - goto out; - - con->state = CEPH_CON_S_V1_CONNECT_MSG; - - /* - * Received banner is good, exchange connection info. - * Do not reset out_kvec, as sending our banner raced - * with receiving peer banner after connect completed. - */ - ret = prepare_write_connect(con); - if (ret < 0) - goto out; - prepare_read_connect(con); - - /* Send connection info before awaiting response */ - goto out; - } - - if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { - ret = read_partial_connect(con); - if (ret <= 0) - goto out; - ret = process_connect(con); - if (ret < 0) - goto out; - goto more; - } - - WARN_ON(con->state != CEPH_CON_S_OPEN); - - if (con->in_base_pos < 0) { - /* - * skipping + discarding content. - */ - ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); - if (ret <= 0) - goto out; - dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); - con->in_base_pos += ret; - if (con->in_base_pos) - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) { - /* - * what's next? 
- */ - ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); - if (ret <= 0) - goto out; - dout("try_read got tag %d\n", (int)con->in_tag); - switch (con->in_tag) { - case CEPH_MSGR_TAG_MSG: - prepare_read_message(con); - break; - case CEPH_MSGR_TAG_ACK: - prepare_read_ack(con); - break; - case CEPH_MSGR_TAG_KEEPALIVE2_ACK: - prepare_read_keepalive_ack(con); - break; - case CEPH_MSGR_TAG_CLOSE: - ceph_con_close_socket(con); - con->state = CEPH_CON_S_CLOSED; - goto out; - default: - goto bad_tag; - } - } - if (con->in_tag == CEPH_MSGR_TAG_MSG) { - ret = read_partial_message(con); - if (ret <= 0) { - switch (ret) { - case -EBADMSG: - con->error_msg = "bad crc/signature"; - fallthrough; - case -EBADE: - ret = -EIO; - break; - case -EIO: - con->error_msg = "io error"; - break; - } - goto out; - } - if (con->in_tag == CEPH_MSGR_TAG_READY) - goto more; - ceph_con_process_message(con); - if (con->state == CEPH_CON_S_OPEN) - prepare_read_tag(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_ACK || - con->in_tag == CEPH_MSGR_TAG_SEQ) { - /* - * the final handshake seq exchange is semantically - * equivalent to an ACK - */ - ret = read_partial_ack(con); - if (ret <= 0) - goto out; - process_ack(con); - goto more; - } - if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { - ret = read_keepalive_ack(con); - if (ret <= 0) - goto out; - goto more; - } - -out: - dout("try_read done on %p ret %d\n", con, ret); - return ret; - -bad_tag: - pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); - con->error_msg = "protocol error, garbage tag"; - ret = -1; - goto out; -} - - /* * Atomically queue work on a connection after the specified delay. * Bump @con reference to avoid races with connection teardown. @@ -3026,7 +1575,6 @@ static void con_fault(struct ceph_connection *con) } } - void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) { u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; @@ -3131,29 +1679,6 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) } EXPORT_SYMBOL(ceph_con_send); -void ceph_con_v1_revoke(struct ceph_connection *con) -{ - struct ceph_msg *msg = con->out_msg; - - WARN_ON(con->out_skip); - /* footer */ - if (con->out_msg_done) { - con->out_skip += con_out_kvec_skip(con); - } else { - WARN_ON(!msg->data_length); - con->out_skip += sizeof_footer(con); - } - /* data, middle, front */ - if (msg->data_length) - con->out_skip += msg->cursor.total_resid; - if (msg->middle) - con->out_skip += con_out_kvec_skip(con); - con->out_skip += con_out_kvec_skip(con); - - dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, - con->out_kvec_bytes, con->out_skip); -} - /* * Revoke a message that was previously queued for send */ @@ -3191,26 +1716,6 @@ void ceph_msg_revoke(struct ceph_msg *msg) mutex_unlock(&con->mutex); } -void ceph_con_v1_revoke_incoming(struct ceph_connection *con) -{ - unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); - unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); - unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); - - /* skip rest of message */ - con->in_base_pos = con->in_base_pos - - sizeof(struct ceph_msg_header) - - front_len - - middle_len - - data_len - - sizeof(struct ceph_msg_footer); - - con->in_tag = CEPH_MSGR_TAG_READY; - con->in_seq++; - - dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos); -} - /* * Revoke a message that we may be reading data into */ diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c new file mode 100644 index 
000000000000..899038a9678e --- /dev/null +++ b/net/ceph/messenger_v1.c @@ -0,0 +1,1502 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* static tag bytes (protocol control messages) */ +static char tag_msg = CEPH_MSGR_TAG_MSG; +static char tag_ack = CEPH_MSGR_TAG_ACK; +static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; +static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; + +/* + * If @buf is NULL, discard up to @len bytes. + */ +static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) +{ + struct kvec iov = {buf, len}; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (!buf) + msg.msg_flags |= MSG_TRUNC; + + iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, len); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, + int page_offset, size_t length) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_offset = page_offset, + .bv_len = length + }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + BUG_ON(page_offset + length > PAGE_SIZE); + iov_iter_bvec(&msg.msg_iter, READ, &bvec, 1, length); + r = sock_recvmsg(sock, &msg, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * write something. @more is true if caller will be sending more data + * shortly. + */ +static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, + size_t kvlen, size_t len, bool more) +{ + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (more) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ + + r = kernel_sendmsg(sock, &msg, iov, kvlen, len); + if (r == -EAGAIN) + r = 0; + return r; +} + +/* + * @more: either or both of MSG_MORE and MSG_SENDPAGE_NOTLAST + */ +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, int more) +{ + ssize_t (*sendpage)(struct socket *sock, struct page *page, + int offset, size_t size, int flags); + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more; + int ret; + + /* + * sendpage cannot properly handle pages with page_count == 0, + * we need to fall back to sendmsg if that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag which + * triggers one of hardened usercopy checks. + */ + if (sendpage_ok(page)) + sendpage = sock->ops->sendpage; + else + sendpage = sock_no_sendpage; + + ret = sendpage(sock, page, offset, size, flags); + if (ret == -EAGAIN) + ret = 0; + + return ret; +} + +static void con_out_kvec_reset(struct ceph_connection *con) +{ + BUG_ON(con->out_skip); + + con->out_kvec_left = 0; + con->out_kvec_bytes = 0; + con->out_kvec_cur = &con->out_kvec[0]; +} + +static void con_out_kvec_add(struct ceph_connection *con, + size_t size, void *data) +{ + int index = con->out_kvec_left; + + BUG_ON(con->out_skip); + BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); + + con->out_kvec[index].iov_len = size; + con->out_kvec[index].iov_base = data; + con->out_kvec_left++; + con->out_kvec_bytes += size; +} + +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. 
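
The out_kvec helpers above (and con_out_kvec_skip() just below) maintain a fixed array of kvecs plus two running counters. A standalone sketch of the same bookkeeping, including the partial-consumption advance that write_partial_kvec() performs after a short send; demo_* names are invented and userspace struct iovec stands in for struct kvec:

#include <stddef.h>
#include <sys/uio.h>

#define DEMO_KVECS 8

struct demo_out {
        struct iovec vec[DEMO_KVECS];
        struct iovec *cur;  /* next entry to write from */
        int left;           /* entries not yet fully sent */
        int bytes;          /* bytes not yet sent */
};

static void demo_kvec_reset(struct demo_out *o)
{
        o->cur = &o->vec[0];
        o->left = 0;
        o->bytes = 0;
}

/* valid only after a reset, matching con_out_kvec_add()'s usage */
static void demo_kvec_add(struct demo_out *o, void *data, size_t len)
{
        o->vec[o->left].iov_base = data;
        o->vec[o->left].iov_len = len;
        o->left++;
        o->bytes += len;
}

/* consume @sent bytes after a (possibly short) sendmsg();
 * the caller never reports more than o->bytes sent */
static void demo_kvec_advance(struct demo_out *o, size_t sent)
{
        o->bytes -= sent;
        while (o->left && sent >= o->cur->iov_len) {
                sent -= o->cur->iov_len;
                o->cur++;
                o->left--;
        }
        if (sent) {
                o->cur->iov_len -= sent;
                o->cur->iov_base = (char *)o->cur->iov_base + sent;
        }
}
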
+ */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int off = con->out_kvec_cur - con->out_kvec; + int skip = 0; + + if (con->out_kvec_bytes > 0) { + skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; + BUG_ON(con->out_kvec_bytes < skip); + BUG_ON(!con->out_kvec_left); + con->out_kvec_bytes -= skip; + con->out_kvec_left--; + } + + return skip; +} + +static size_t sizeof_footer(struct ceph_connection *con) +{ + return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? + sizeof(struct ceph_msg_footer) : + sizeof(struct ceph_msg_footer_old); +} + +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) +{ + /* Initialize data cursor */ + + ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); +} + +/* + * Prepare footer for currently outgoing message, and finish things + * off. Assumes out_kvec* are already valid.. we just add on to the end. + */ +static void prepare_write_message_footer(struct ceph_connection *con) +{ + struct ceph_msg *m = con->out_msg; + + m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; + + dout("prepare_write_message_footer %p\n", con); + con_out_kvec_add(con, sizeof_footer(con), &m->footer); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { + if (con->ops->sign_message) + con->ops->sign_message(m); + else + m->footer.sig = 0; + } else { + m->old_footer.flags = m->footer.flags; + } + con->out_more = m->more_to_follow; + con->out_msg_done = true; +} + +/* + * Prepare headers for the next outgoing message. + */ +static void prepare_write_message(struct ceph_connection *con) +{ + struct ceph_msg *m; + u32 crc; + + con_out_kvec_reset(con); + con->out_msg_done = false; + + /* Sneak an ack in there first? If we can get it into the same + * TCP packet that's a good thing. */ + if (con->in_seq > con->in_seq_acked) { + con->in_seq_acked = con->in_seq; + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + } + + ceph_con_get_out_msg(con); + m = con->out_msg; + + dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", + m, con->out_seq, le16_to_cpu(m->hdr.type), + le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), + m->data_length); + WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); + + /* tag + hdr + front + middle */ + con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); + con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + + if (m->middle) + con_out_kvec_add(con, m->middle->vec.iov_len, + m->middle->vec.iov_base); + + /* fill in hdr crc and finalize hdr */ + crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); + con->out_msg->hdr.crc = cpu_to_le32(crc); + memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); + + /* fill in front and middle crc, footer */ + crc = crc32c(0, m->front.iov_base, m->front.iov_len); + con->out_msg->footer.front_crc = cpu_to_le32(crc); + if (m->middle) { + crc = crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len); + con->out_msg->footer.middle_crc = cpu_to_le32(crc); + } else + con->out_msg->footer.middle_crc = 0; + dout("%s front_crc %u middle_crc %u\n", __func__, + le32_to_cpu(con->out_msg->footer.front_crc), + le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; + + /* is there a data payload? 
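
Note the CRC split in prepare_write_message(): the header, front and middle CRCs are computed up front, while the data CRC is accumulated page by page as the payload goes out (see write_partial_message_data() further down). That works because crc32c() accepts the running value as its seed, so chunked and one-shot computation agree. A self-contained bitwise CRC-32C sketch showing the property; the demo_ name is invented and its seeding convention is not guaranteed bit-identical to the kernel lib:

#include <stddef.h>
#include <stdint.h>

static uint32_t demo_crc32c(uint32_t crc, const unsigned char *p, size_t len)
{
        crc = ~crc;
        while (len--) {
                crc ^= *p++;
                for (int k = 0; k < 8; k++)
                        crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
        }
        return ~crc;
}

/*
 * Chunked use composes exactly like a single pass:
 *   demo_crc32c(demo_crc32c(0, a, la), b, lb) == demo_crc32c(0, a||b, la+lb)
 * which is what lets the data crc accumulate across pages.
 */
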
*/ + con->out_msg->footer.data_crc = 0; + if (m->data_length) { + prepare_message_data(con->out_msg, m->data_length); + con->out_more = 1; /* data + footer will follow */ + } else { + /* no, queue up footer too and be done */ + prepare_write_message_footer(con); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare an ack. + */ +static void prepare_write_ack(struct ceph_connection *con) +{ + dout("prepare_write_ack %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + + con->out_more = 1; /* more will follow.. eventually.. */ + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ + dout("prepare_write_seq %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Prepare to write keepalive byte. + */ +static void prepare_write_keepalive(struct ceph_connection *con) +{ + dout("prepare_write_keepalive %p\n", con); + con_out_kvec_reset(con); + if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { + struct timespec64 now; + + ktime_get_real_ts64(&now); + con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); + ceph_encode_timespec64(&con->out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), + &con->out_temp_keepalive2); + } else { + con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); + } + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +/* + * Connection negotiation. + */ + +static int get_connect_authorizer(struct ceph_connection *con) +{ + struct ceph_auth_handshake *auth; + int auth_proto; + + if (!con->ops->get_authorizer) { + con->auth = NULL; + con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->out_connect.authorizer_len = 0; + return 0; + } + + auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + con->auth = auth; + con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); + return 0; +} + +/* + * We connected to a peer and are saying hello. 
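
prepare_write_banner() below puts two things on the wire: the fixed banner string (CEPH_BANNER, with no trailing NUL) followed by the sender's encoded entity address. A userspace sketch of that layout; the demo_* names, the banner value and the placeholder address size are assumptions for illustration, not the real msgr1 wire structures:

#include <string.h>
#include <sys/uio.h>

#define DEMO_BANNER "ceph v027"  /* assumed value of CEPH_BANNER */

struct demo_enc_addr {
        unsigned char bytes[136];  /* placeholder, not the real size */
};

static void demo_queue_banner(struct iovec iov[2],
                              struct demo_enc_addr *my_addr)
{
        iov[0].iov_base = (void *)DEMO_BANNER;
        iov[0].iov_len = strlen(DEMO_BANNER);  /* NUL not sent */
        iov[1].iov_base = my_addr;
        iov[1].iov_len = sizeof(*my_addr);
}
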
+ */ +static void prepare_write_banner(struct ceph_connection *con) +{ + con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), + &con->msgr->my_enc_addr); + + con->out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static void __prepare_write_connect(struct ceph_connection *con) +{ + con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); + if (con->auth) + con_out_kvec_add(con, con->auth->authorizer_buf_len, + con->auth->authorizer_buf); + + con->out_more = 0; + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); +} + +static int prepare_write_connect(struct ceph_connection *con) +{ + unsigned int global_seq = ceph_get_global_seq(con->msgr, 0); + int proto; + int ret; + + switch (con->peer_name.type) { + case CEPH_ENTITY_TYPE_MON: + proto = CEPH_MONC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_OSD: + proto = CEPH_OSDC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_MDS: + proto = CEPH_MDSC_PROTOCOL; + break; + default: + BUG(); + } + + dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, + con->connect_seq, global_seq, proto); + + con->out_connect.features = + cpu_to_le64(from_msgr(con->msgr)->supported_features); + con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); + con->out_connect.global_seq = cpu_to_le32(global_seq); + con->out_connect.protocol_version = cpu_to_le32(proto); + con->out_connect.flags = 0; + + ret = get_connect_authorizer(con); + if (ret) + return ret; + + __prepare_write_connect(con); + return 0; +} + +/* + * write as much of pending kvecs to the socket as we can. + * 1 -> done + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_kvec(struct ceph_connection *con) +{ + int ret; + + dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); + while (con->out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, + con->out_kvec_left, con->out_kvec_bytes, + con->out_more); + if (ret <= 0) + goto out; + con->out_kvec_bytes -= ret; + if (con->out_kvec_bytes == 0) + break; /* done */ + + /* account for full iov entries consumed */ + while (ret >= con->out_kvec_cur->iov_len) { + BUG_ON(!con->out_kvec_left); + ret -= con->out_kvec_cur->iov_len; + con->out_kvec_cur++; + con->out_kvec_left--; + } + /* and for a partially-consumed entry */ + if (ret) { + con->out_kvec_cur->iov_len -= ret; + con->out_kvec_cur->iov_base += ret; + } + } + con->out_kvec_left = 0; + ret = 1; +out: + dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, + con->out_kvec_bytes, con->out_kvec_left, ret); + return ret; /* done! */ +} + +/* + * Write as much message data payload as we can. If we finish, queue + * up the footer. + * 1 -> done, footer is now queued in out_kvec[]. + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_message_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + u32 crc; + + dout("%s %p msg %p\n", __func__, con, msg); + + if (!msg->num_data_items) + return -EINVAL; + + /* + * Iterate through each page that contains data to be + * written, and send as much as possible for each. + * + * If we are calculating the data crc (the default), we will + * need to map the page. 
If we have no pages, they have + * been revoked, so use the zero page. + */ + crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; + while (cursor->total_resid) { + struct page *page; + size_t page_offset; + size_t length; + int ret; + + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + if (length == cursor->total_resid) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, + more); + if (ret <= 0) { + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + + return ret; + } + if (do_datacrc && cursor->need_crc) + crc = ceph_crc32c_page(crc, page, page_offset, length); + ceph_msg_data_advance(cursor, (size_t)ret); + } + + dout("%s %p msg %p done\n", __func__, con, msg); + + /* prepare and queue up footer, too */ + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + else + msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con_out_kvec_reset(con); + prepare_write_message_footer(con); + + return 1; /* must return > 0 to indicate success */ +} + +/* + * write some zeros + */ +static int write_partial_skip(struct ceph_connection *con) +{ + int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; + int ret; + + dout("%s %p %d left\n", __func__, con, con->out_skip); + while (con->out_skip > 0) { + size_t size = min(con->out_skip, (int) PAGE_SIZE); + + if (size == con->out_skip) + more = MSG_MORE; + ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, + more); + if (ret <= 0) + goto out; + con->out_skip -= ret; + } + ret = 1; +out: + return ret; +} + +/* + * Prepare to read connection handshake, or an ack. + */ +static void prepare_read_banner(struct ceph_connection *con) +{ + dout("prepare_read_banner %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_connect(struct ceph_connection *con) +{ + dout("prepare_read_connect %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_ack(struct ceph_connection *con) +{ + dout("prepare_read_ack %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_seq(struct ceph_connection *con) +{ + dout("prepare_read_seq %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_SEQ; +} + +static void prepare_read_tag(struct ceph_connection *con) +{ + dout("prepare_read_tag %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_READY; +} + +static void prepare_read_keepalive_ack(struct ceph_connection *con) +{ + dout("prepare_read_keepalive_ack %p\n", con); + con->in_base_pos = 0; +} + +/* + * Prepare to read a message. 
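
read_partial(), defined just below, resumes interrupted reads using one cumulative stream position (con->in_base_pos): @end is where the object finishes in the stream and @size is the object's size, so the resume offset inside the object falls out as size - (end - pos). A userspace model of the same arithmetic, with an invented demo_ name and read() standing in for ceph_tcp_recvmsg():

#include <errno.h>
#include <sys/types.h>
#include <unistd.h>

static int demo_read_partial(int fd, int *pos, int end, int size,
                             void *object)
{
        while (*pos < end) {
                int left = end - *pos;   /* bytes still missing */
                int have = size - left;  /* bytes already read */
                ssize_t ret = read(fd, (char *)object + have, left);

                if (ret < 0)
                        return (errno == EAGAIN) ? 0 : -1;
                if (ret == 0)
                        return 0;  /* connection closed / no data */
                *pos += ret;
        }
        return 1;  /* object complete */
}
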
+ */ +static int prepare_read_message(struct ceph_connection *con) +{ + dout("prepare_read_message %p\n", con); + BUG_ON(con->in_msg != NULL); + con->in_base_pos = 0; + con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; + return 0; +} + +static int read_partial(struct ceph_connection *con, + int end, int size, void *object) +{ + while (con->in_base_pos < end) { + int left = end - con->in_base_pos; + int have = size - left; + int ret = ceph_tcp_recvmsg(con->sock, object + have, left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + } + return 1; +} + +/* + * Read all or part of the connect-side handshake on a new connection + */ +static int read_partial_banner(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_banner %p at %d\n", con, con->in_base_pos); + + /* peer's banner */ + size = strlen(CEPH_BANNER); + end = size; + ret = read_partial(con, end, size, con->in_banner); + if (ret <= 0) + goto out; + + size = sizeof (con->actual_peer_addr); + end += size; + ret = read_partial(con, end, size, &con->actual_peer_addr); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->actual_peer_addr); + + size = sizeof (con->peer_addr_for_me); + end += size; + ret = read_partial(con, end, size, &con->peer_addr_for_me); + if (ret <= 0) + goto out; + ceph_decode_banner_addr(&con->peer_addr_for_me); + +out: + return ret; +} + +static int read_partial_connect(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_connect %p at %d\n", con, con->in_base_pos); + + size = sizeof (con->in_reply); + end = size; + ret = read_partial(con, end, size, &con->in_reply); + if (ret <= 0) + goto out; + + if (con->auth) { + size = le32_to_cpu(con->in_reply.authorizer_len); + if (size > con->auth->authorizer_reply_buf_len) { + pr_err("authorizer reply too big: %d > %zu\n", size, + con->auth->authorizer_reply_buf_len); + ret = -EINVAL; + goto out; + } + + end += size; + ret = read_partial(con, end, size, + con->auth->authorizer_reply_buf); + if (ret <= 0) + goto out; + } + + dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", + con, (int)con->in_reply.tag, + le32_to_cpu(con->in_reply.connect_seq), + le32_to_cpu(con->in_reply.global_seq)); +out: + return ret; +} + +/* + * Verify the hello banner looks okay. + */ +static int verify_hello(struct ceph_connection *con) +{ + if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + pr_err("connect to %s got bad banner\n", + ceph_pr_addr(&con->peer_addr)); + con->error_msg = "protocol error, bad banner"; + return -1; + } + return 0; +} + +static int process_banner(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + + dout("process_banner on %p\n", con); + + if (verify_hello(con) < 0) + return -1; + + /* + * Make sure the other end is who we wanted. note that the other + * end may not yet know their ip address, so if it's 0.0.0.0, give + * them the benefit of the doubt. + */ + if (memcmp(&con->peer_addr, &con->actual_peer_addr, + sizeof(con->peer_addr)) != 0 && + !(ceph_addr_is_blank(&con->actual_peer_addr) && + con->actual_peer_addr.nonce == con->peer_addr.nonce)) { + pr_warn("wrong peer, want %s/%u, got %s/%u\n", + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&con->actual_peer_addr), + le32_to_cpu(con->actual_peer_addr.nonce)); + con->error_msg = "wrong peer at address"; + return -1; + } + + /* + * did we learn our address? 
+ */ + if (ceph_addr_is_blank(my_addr)) { + memcpy(&my_addr->in_addr, + &con->peer_addr_for_me.in_addr, + sizeof(con->peer_addr_for_me.in_addr)); + ceph_addr_set_port(my_addr, 0); + ceph_encode_my_addr(con->msgr); + dout("process_banner learned my addr is %s\n", + ceph_pr_addr(my_addr)); + } + + return 0; +} + +static int process_connect(struct ceph_connection *con) +{ + u64 sup_feat = from_msgr(con->msgr)->supported_features; + u64 req_feat = from_msgr(con->msgr)->required_features; + u64 server_feat = le64_to_cpu(con->in_reply.features); + int ret; + + dout("process_connect on %p tag %d\n", con, (int)con->in_tag); + + if (con->auth) { + int len = le32_to_cpu(con->in_reply.authorizer_len); + + /* + * Any connection that defines ->get_authorizer() + * should also define ->add_authorizer_challenge() and + * ->verify_authorizer_reply(). + * + * See get_connect_authorizer(). + */ + if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { + ret = con->ops->add_authorizer_challenge( + con, con->auth->authorizer_reply_buf, len); + if (ret < 0) + return ret; + + con_out_kvec_reset(con); + __prepare_write_connect(con); + prepare_read_connect(con); + return 0; + } + + if (len) { + ret = con->ops->verify_authorizer_reply(con); + if (ret < 0) { + con->error_msg = "bad authorize reply"; + return ret; + } + } + } + + switch (con->in_reply.tag) { + case CEPH_MSGR_TAG_FEATURES: + pr_err("%s%lld %s feature set mismatch," + " my %llx < server's %llx, missing %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + sup_feat, server_feat, server_feat & ~sup_feat); + con->error_msg = "missing required protocol features"; + return -1; + + case CEPH_MSGR_TAG_BADPROTOVER: + pr_err("%s%lld %s protocol version mismatch," + " my %d != server's %d\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->out_connect.protocol_version), + le32_to_cpu(con->in_reply.protocol_version)); + con->error_msg = "protocol version mismatch"; + return -1; + + case CEPH_MSGR_TAG_BADAUTHORIZER: + con->auth_retry++; + dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, + con->auth_retry); + if (con->auth_retry == 2) { + con->error_msg = "connect authorization failure"; + return -1; + } + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RESETSESSION: + /* + * If we connected with a large connect_seq but the peer + * has no record of a session with us (no connection, or + * connect_seq == 0), they will send RESETSESION to indicate + * that they must have reset their session, and may have + * dropped messages. + */ + dout("process_connect got RESET peer seq %u\n", + le32_to_cpu(con->in_reply.connect_seq)); + pr_info("%s%lld %s session reset\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + + /* Tell ceph about it. */ + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V1_CONNECT_MSG) + return -EAGAIN; + break; + + case CEPH_MSGR_TAG_RETRY_SESSION: + /* + * If we sent a smaller connect_seq than the peer has, try + * again with a larger value. 
+ */ + dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", + le32_to_cpu(con->out_connect.connect_seq), + le32_to_cpu(con->in_reply.connect_seq)); + con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RETRY_GLOBAL: + /* + * If we sent a smaller global_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", + con->peer_global_seq, + le32_to_cpu(con->in_reply.global_seq)); + ceph_get_global_seq(con->msgr, + le32_to_cpu(con->in_reply.global_seq)); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_SEQ: + case CEPH_MSGR_TAG_READY: + if (req_feat & ~server_feat) { + pr_err("%s%lld %s protocol feature mismatch," + " my required %llx > server's %llx, need %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + req_feat, server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -1; + } + + WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); + con->state = CEPH_CON_S_OPEN; + con->auth_retry = 0; /* we authenticated; clear flag */ + con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); + con->connect_seq++; + con->peer_features = server_feat; + dout("process_connect got READY gseq %d cseq %d (%d)\n", + con->peer_global_seq, + le32_to_cpu(con->in_reply.connect_seq), + con->connect_seq); + WARN_ON(con->connect_seq != + le32_to_cpu(con->in_reply.connect_seq)); + + if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + + con->delay = 0; /* reset backoff memory */ + + if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { + prepare_write_seq(con); + prepare_read_seq(con); + } else { + prepare_read_tag(con); + } + break; + + case CEPH_MSGR_TAG_WAIT: + /* + * If there is a connection race (we are opening + * connections to each other), one of us may just have + * to WAIT. This shouldn't happen if we are the + * client. + */ + con->error_msg = "protocol error, got WAIT as client"; + return -1; + + default: + con->error_msg = "protocol error, garbage tag during connect"; + return -1; + } + return 0; +} + +/* + * read (part of) an ack + */ +static int read_partial_ack(struct ceph_connection *con) +{ + int size = sizeof (con->in_temp_ack); + int end = size; + + return read_partial(con, end, size, &con->in_temp_ack); +} + +/* + * We can finally discard anything that's been acked. 
+ */ +static void process_ack(struct ceph_connection *con) +{ + u64 ack = le64_to_cpu(con->in_temp_ack); + + if (con->in_tag == CEPH_MSGR_TAG_ACK) + ceph_con_discard_sent(con, ack); + else + ceph_con_discard_requeued(con, ack); + + prepare_read_tag(con); +} + +static int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) +{ + int ret, left; + + BUG_ON(!section); + + while (section->iov_len < sec_len) { + BUG_ON(section->iov_base == NULL); + left = sec_len - section->iov_len; + ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + + section->iov_len, left); + if (ret <= 0) + return ret; + section->iov_len += ret; + } + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, section->iov_len); + + return 1; +} + +static int read_partial_msg_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + struct page *page; + size_t page_offset; + size_t length; + u32 crc = 0; + int ret; + + if (!msg->num_data_items) + return -EIO; + + if (do_datacrc) + crc = con->in_data_crc; + while (cursor->total_resid) { + if (!cursor->resid) { + ceph_msg_data_advance(cursor, 0); + continue; + } + + page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; + + return ret; + } + + if (do_datacrc) + crc = ceph_crc32c_page(crc, page, page_offset, ret); + ceph_msg_data_advance(cursor, (size_t)ret); + } + if (do_datacrc) + con->in_data_crc = crc; + + return 1; /* must return > 0 to indicate success */ +} + +/* + * read (part of) a message. + */ +static int read_partial_message(struct ceph_connection *con) +{ + struct ceph_msg *m = con->in_msg; + int size; + int end; + int ret; + unsigned int front_len, middle_len, data_len; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); + u64 seq; + u32 crc; + + dout("read_partial_message con %p msg %p\n", con, m); + + /* header */ + size = sizeof (con->in_hdr); + end = size; + ret = read_partial(con, end, size, &con->in_hdr); + if (ret <= 0) + return ret; + + crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); + if (cpu_to_le32(crc) != con->in_hdr.crc) { + pr_err("read_partial_message bad hdr crc %u != expected %u\n", + crc, con->in_hdr.crc); + return -EBADMSG; + } + + front_len = le32_to_cpu(con->in_hdr.front_len); + if (front_len > CEPH_MSG_MAX_FRONT_LEN) + return -EIO; + middle_len = le32_to_cpu(con->in_hdr.middle_len); + if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) + return -EIO; + data_len = le32_to_cpu(con->in_hdr.data_len); + if (data_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + + /* verify seq# */ + seq = le64_to_cpu(con->in_hdr.seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("skipping %s%lld %s seq %lld expected %lld\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof_footer(con); + con->in_tag = CEPH_MSGR_TAG_READY; + return 1; + } else if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("read_partial_message bad seq %lld expected %lld\n", + seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } + + /* allocate message? 
*/ + if (!con->in_msg) { + int skip = 0; + + dout("got hdr type %d front %d data %d\n", con->in_hdr.type, + front_len, data_len); + ret = ceph_con_in_msg_alloc(con, &con->in_hdr, &skip); + if (ret < 0) + return ret; + + BUG_ON(!con->in_msg ^ skip); + if (skip) { + /* skip this message */ + dout("alloc_msg said skip message\n"); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof_footer(con); + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + return 1; + } + + BUG_ON(!con->in_msg); + BUG_ON(con->in_msg->con != con); + m = con->in_msg; + m->front.iov_len = 0; /* haven't read it yet */ + if (m->middle) + m->middle->vec.iov_len = 0; + + /* prepare for data payload, if any */ + + if (data_len) + prepare_message_data(con->in_msg, data_len); + } + + /* front */ + ret = read_partial_message_section(con, &m->front, front_len, + &con->in_front_crc); + if (ret <= 0) + return ret; + + /* middle */ + if (m->middle) { + ret = read_partial_message_section(con, &m->middle->vec, + middle_len, + &con->in_middle_crc); + if (ret <= 0) + return ret; + } + + /* (page) data */ + if (data_len) { + ret = read_partial_msg_data(con); + if (ret <= 0) + return ret; + } + + /* footer */ + size = sizeof_footer(con); + end += size; + ret = read_partial(con, end, size, &m->footer); + if (ret <= 0) + return ret; + + if (!need_sign) { + m->footer.flags = m->old_footer.flags; + m->footer.sig = 0; + } + + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", + m, front_len, m->footer.front_crc, middle_len, + m->footer.middle_crc, data_len, m->footer.data_crc); + + /* crc ok? */ + if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { + pr_err("read_partial_message %p front crc %u != exp. %u\n", + m, con->in_front_crc, m->footer.front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { + pr_err("read_partial_message %p middle crc %u != exp %u\n", + m, con->in_middle_crc, m->footer.middle_crc); + return -EBADMSG; + } + if (do_datacrc && + (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && + con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { + pr_err("read_partial_message %p data crc %u != exp. %u\n", m, + con->in_data_crc, le32_to_cpu(m->footer.data_crc)); + return -EBADMSG; + } + + if (need_sign && con->ops->check_message_signature && + con->ops->check_message_signature(m)) { + pr_err("read_partial_message %p signature check failed\n", m); + return -EBADMSG; + } + + return 1; /* done! */ +} + +static int read_keepalive_ack(struct ceph_connection *con) +{ + struct ceph_timespec ceph_ts; + size_t size = sizeof(ceph_ts); + int ret = read_partial(con, size, size, &ceph_ts); + if (ret <= 0) + return ret; + ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); + prepare_read_tag(con); + return 1; +} + +/* + * Read what we can from the socket. + */ +int ceph_con_v1_try_read(struct ceph_connection *con) +{ + int ret = -1; + +more: + dout("try_read start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + BUG_ON(!con->sock); + + dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, + con->in_base_pos); + + if (con->state == CEPH_CON_S_V1_BANNER) { + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + + con->state = CEPH_CON_S_V1_CONNECT_MSG; + + /* + * Received banner is good, exchange connection info. 
+ * Do not reset out_kvec, as sending our banner raced + * with receiving peer banner after connect completed. + */ + ret = prepare_write_connect(con); + if (ret < 0) + goto out; + prepare_read_connect(con); + + /* Send connection info before awaiting response */ + goto out; + } + + if (con->state == CEPH_CON_S_V1_CONNECT_MSG) { + ret = read_partial_connect(con); + if (ret <= 0) + goto out; + ret = process_connect(con); + if (ret < 0) + goto out; + goto more; + } + + WARN_ON(con->state != CEPH_CON_S_OPEN); + + if (con->in_base_pos < 0) { + /* + * skipping + discarding content. + */ + ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); + if (ret <= 0) + goto out; + dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); + con->in_base_pos += ret; + if (con->in_base_pos) + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_READY) { + /* + * what's next? + */ + ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); + if (ret <= 0) + goto out; + dout("try_read got tag %d\n", (int)con->in_tag); + switch (con->in_tag) { + case CEPH_MSGR_TAG_MSG: + prepare_read_message(con); + break; + case CEPH_MSGR_TAG_ACK: + prepare_read_ack(con); + break; + case CEPH_MSGR_TAG_KEEPALIVE2_ACK: + prepare_read_keepalive_ack(con); + break; + case CEPH_MSGR_TAG_CLOSE: + ceph_con_close_socket(con); + con->state = CEPH_CON_S_CLOSED; + goto out; + default: + goto bad_tag; + } + } + if (con->in_tag == CEPH_MSGR_TAG_MSG) { + ret = read_partial_message(con); + if (ret <= 0) { + switch (ret) { + case -EBADMSG: + con->error_msg = "bad crc/signature"; + fallthrough; + case -EBADE: + ret = -EIO; + break; + case -EIO: + con->error_msg = "io error"; + break; + } + goto out; + } + if (con->in_tag == CEPH_MSGR_TAG_READY) + goto more; + ceph_con_process_message(con); + if (con->state == CEPH_CON_S_OPEN) + prepare_read_tag(con); + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_ACK || + con->in_tag == CEPH_MSGR_TAG_SEQ) { + /* + * the final handshake seq exchange is semantically + * equivalent to an ACK + */ + ret = read_partial_ack(con); + if (ret <= 0) + goto out; + process_ack(con); + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + ret = read_keepalive_ack(con); + if (ret <= 0) + goto out; + goto more; + } + +out: + dout("try_read done on %p ret %d\n", con, ret); + return ret; + +bad_tag: + pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); + con->error_msg = "protocol error, garbage tag"; + ret = -1; + goto out; +} + +/* + * Write something to the socket. Called in a worker thread when the + * socket appears to be writeable and we have something ready to send. + */ +int ceph_con_v1_try_write(struct ceph_connection *con) +{ + int ret = 1; + + dout("try_write start %p state %d\n", con, con->state); + if (con->state != CEPH_CON_S_PREOPEN && + con->state != CEPH_CON_S_V1_BANNER && + con->state != CEPH_CON_S_V1_CONNECT_MSG && + con->state != CEPH_CON_S_OPEN) + return 0; + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + BUG_ON(con->sock); + con->state = CEPH_CON_S_V1_BANNER; + + con_out_kvec_reset(con); + prepare_write_banner(con); + prepare_read_banner(con); + + BUG_ON(con->in_msg); + con->in_tag = CEPH_MSGR_TAG_READY; + dout("try_write initiating connect on %p new state %d\n", + con, con->state); + ret = ceph_tcp_connect(con); + if (ret < 0) { + con->error_msg = "connect error"; + goto out; + } + } + +more: + dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); + BUG_ON(!con->sock); + + /* kvec data queued? 
*/ + if (con->out_kvec_left) { + ret = write_partial_kvec(con); + if (ret <= 0) + goto out; + } + if (con->out_skip) { + ret = write_partial_skip(con); + if (ret <= 0) + goto out; + } + + /* msg pages? */ + if (con->out_msg) { + if (con->out_msg_done) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; /* we're done with this one */ + goto do_next; + } + + ret = write_partial_message_data(con); + if (ret == 1) + goto more; /* we need to send the footer, too! */ + if (ret == 0) + goto out; + if (ret < 0) { + dout("try_write write_partial_message_data err %d\n", + ret); + goto out; + } + } + +do_next: + if (con->state == CEPH_CON_S_OPEN) { + if (ceph_con_flag_test_and_clear(con, + CEPH_CON_F_KEEPALIVE_PENDING)) { + prepare_write_keepalive(con); + goto more; + } + /* is anything else pending? */ + if (!list_empty(&con->out_queue)) { + prepare_write_message(con); + goto more; + } + if (con->in_seq > con->in_seq_acked) { + prepare_write_ack(con); + goto more; + } + } + + /* Nothing to do! */ + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + dout("try_write nothing else to write.\n"); + ret = 0; +out: + dout("try_write done on %p ret %d\n", con, ret); + return ret; +} + +void ceph_con_v1_revoke(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + WARN_ON(con->out_skip); + /* footer */ + if (con->out_msg_done) { + con->out_skip += con_out_kvec_skip(con); + } else { + WARN_ON(!msg->data_length); + con->out_skip += sizeof_footer(con); + } + /* data, middle, front */ + if (msg->data_length) + con->out_skip += msg->cursor.total_resid; + if (msg->middle) + con->out_skip += con_out_kvec_skip(con); + con->out_skip += con_out_kvec_skip(con); + + dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, + con->out_kvec_bytes, con->out_skip); +} + +void ceph_con_v1_revoke_incoming(struct ceph_connection *con) +{ + unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); + unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); + unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); + + /* skip rest of message */ + con->in_base_pos = con->in_base_pos - + sizeof(struct ceph_msg_header) - + front_len - + middle_len - + data_len - + sizeof(struct ceph_msg_footer); + + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + + dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos); +} + +bool ceph_con_v1_opened(struct ceph_connection *con) +{ + return con->connect_seq; +} + +void ceph_con_v1_reset_session(struct ceph_connection *con) +{ + con->connect_seq = 0; + con->peer_global_seq = 0; +} + +void ceph_con_v1_reset_protocol(struct ceph_connection *con) +{ + con->out_skip = 0; +} -- cgit v1.2.3 From a56dd9bf47220c3206f27075af8bdfb219a2a3cf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 12 Nov 2020 16:31:41 +0100 Subject: libceph: move msgr1 protocol specific fields to its own struct A couple whitespace fixups, no functional changes. 
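Concretely (this sketch only restates the hunks that follow; it is an illustration, not code beyond the patch): the msgr1-only state moves into the new struct ceph_connection_v1_info, which is embedded in struct ceph_connection as "v1", so every access to those fields gains a .v1 hop:

    struct ceph_connection {
            ...
            struct ceph_connection_v1_info v1;  /* msgr1-only state */
    };

    con->connect_seq++;     /* before */
    con->v1.connect_seq++;  /* after */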
Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 76 ++++---- net/ceph/messenger.c | 8 +- net/ceph/messenger_v1.c | 420 +++++++++++++++++++++-------------------- 3 files changed, 257 insertions(+), 247 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index b5268127c55e..54a64e8dfce6 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -264,6 +264,43 @@ struct ceph_msg { #define BASE_DELAY_INTERVAL (HZ / 4) #define MAX_DELAY_INTERVAL (15 * HZ) +struct ceph_connection_v1_info { + struct kvec out_kvec[8], /* sending header/footer data */ + *out_kvec_cur; + int out_kvec_left; /* kvec's left in out_kvec */ + int out_skip; /* skip this many bytes */ + int out_kvec_bytes; /* total bytes left */ + bool out_more; /* there is more data after the kvecs */ + bool out_msg_done; + + struct ceph_auth_handshake *auth; + int auth_retry; /* true if we need a newer authorizer */ + + /* connection negotiation temps */ + u8 in_banner[CEPH_BANNER_MAX_LEN]; + struct ceph_entity_addr actual_peer_addr; + struct ceph_entity_addr peer_addr_for_me; + struct ceph_msg_connect out_connect; + struct ceph_msg_connect_reply in_reply; + + int in_base_pos; /* bytes read */ + + /* message in temps */ + u8 in_tag; /* protocol control byte */ + struct ceph_msg_header in_hdr; + __le64 in_temp_ack; /* for reading an ack */ + + /* message out temps */ + struct ceph_msg_header out_hdr; + __le64 out_temp_ack; /* for writing an ack */ + struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 + stamp */ + + u32 connect_seq; /* identify the most recent connection + attempt for this session */ + u32 peer_global_seq; /* peer's global seq for this connection */ +}; + /* * A single connection with another host. 
* @@ -281,21 +318,13 @@ struct ceph_connection { int state; /* CEPH_CON_S_* */ atomic_t sock_state; struct socket *sock; - struct ceph_entity_addr peer_addr; /* peer address */ - struct ceph_entity_addr peer_addr_for_me; unsigned long flags; /* CEPH_CON_F_* */ const char *error_msg; /* error message, if any */ struct ceph_entity_name peer_name; /* peer name */ - + struct ceph_entity_addr peer_addr; /* peer address */ u64 peer_features; - u32 connect_seq; /* identify the most recent connection - attempt for this connection, client */ - u32 peer_global_seq; /* peer's global seq for this connection */ - - struct ceph_auth_handshake *auth; - int auth_retry; /* true if we need a newer authorizer */ struct mutex mutex; @@ -306,41 +335,18 @@ struct ceph_connection { u64 in_seq, in_seq_acked; /* last message received, acked */ - /* connection negotiation temps */ - char in_banner[CEPH_BANNER_MAX_LEN]; - struct ceph_msg_connect out_connect; - struct ceph_msg_connect_reply in_reply; - struct ceph_entity_addr actual_peer_addr; - - /* message out temps */ - struct ceph_msg_header out_hdr; + struct ceph_msg *in_msg; struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ - bool out_msg_done; - - struct kvec out_kvec[8], /* sending header/footer data */ - *out_kvec_cur; - int out_kvec_left; /* kvec's left in out_kvec */ - int out_skip; /* skip this many bytes */ - int out_kvec_bytes; /* total bytes left */ - int out_more; /* there is more data after the kvecs */ - __le64 out_temp_ack; /* for writing an ack */ - struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 - stamp */ - /* message in temps */ - struct ceph_msg_header in_hdr; - struct ceph_msg *in_msg; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ - char in_tag; /* protocol control byte */ - int in_base_pos; /* bytes read */ - __le64 in_temp_ack; /* for reading an ack */ - struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */ struct delayed_work work; /* send|recv work */ unsigned long delay; /* current delay interval */ + + struct ceph_connection_v1_info v1; }; extern struct page *ceph_zero_page; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 544cfdbe52d6..4fb3c33a7b03 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1448,11 +1448,11 @@ static void con_fault_finish(struct ceph_connection *con) * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. 
*/ - if (con->auth_retry) { - dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->v1.auth_retry) { + dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); - con->auth_retry = 0; + con->v1.auth_retry = 0; } if (con->ops->fault) @@ -1631,7 +1631,7 @@ static void clear_standby(struct ceph_connection *con) if (con->state == CEPH_CON_S_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); con->state = CEPH_CON_S_PREOPEN; - con->connect_seq++; + con->v1.connect_seq++; WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 899038a9678e..04f653b3c897 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -110,25 +110,25 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page, static void con_out_kvec_reset(struct ceph_connection *con) { - BUG_ON(con->out_skip); + BUG_ON(con->v1.out_skip); - con->out_kvec_left = 0; - con->out_kvec_bytes = 0; - con->out_kvec_cur = &con->out_kvec[0]; + con->v1.out_kvec_left = 0; + con->v1.out_kvec_bytes = 0; + con->v1.out_kvec_cur = &con->v1.out_kvec[0]; } static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { - int index = con->out_kvec_left; + int index = con->v1.out_kvec_left; - BUG_ON(con->out_skip); - BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); + BUG_ON(con->v1.out_skip); + BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec)); - con->out_kvec[index].iov_len = size; - con->out_kvec[index].iov_base = data; - con->out_kvec_left++; - con->out_kvec_bytes += size; + con->v1.out_kvec[index].iov_len = size; + con->v1.out_kvec[index].iov_base = data; + con->v1.out_kvec_left++; + con->v1.out_kvec_bytes += size; } /* @@ -138,15 +138,14 @@ static void con_out_kvec_add(struct ceph_connection *con, */ static int con_out_kvec_skip(struct ceph_connection *con) { - int off = con->out_kvec_cur - con->out_kvec; int skip = 0; - if (con->out_kvec_bytes > 0) { - skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; - BUG_ON(con->out_kvec_bytes < skip); - BUG_ON(!con->out_kvec_left); - con->out_kvec_bytes -= skip; - con->out_kvec_left--; + if (con->v1.out_kvec_bytes > 0) { + skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len; + BUG_ON(con->v1.out_kvec_bytes < skip); + BUG_ON(!con->v1.out_kvec_left); + con->v1.out_kvec_bytes -= skip; + con->v1.out_kvec_left--; } return skip; @@ -186,8 +185,8 @@ static void prepare_write_message_footer(struct ceph_connection *con) } else { m->old_footer.flags = m->footer.flags; } - con->out_more = m->more_to_follow; - con->out_msg_done = true; + con->v1.out_more = m->more_to_follow; + con->v1.out_msg_done = true; } /* @@ -199,16 +198,16 @@ static void prepare_write_message(struct ceph_connection *con) u32 crc; con_out_kvec_reset(con); - con->out_msg_done = false; + con->v1.out_msg_done = false; /* Sneak an ack in there first? If we can get it into the same * TCP packet that's a good thing. 
*/ if (con->in_seq > con->in_seq_acked) { con->in_seq_acked = con->in_seq; con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); } ceph_con_get_out_msg(con); @@ -223,7 +222,7 @@ static void prepare_write_message(struct ceph_connection *con) /* tag + hdr + front + middle */ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); + con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr); con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) @@ -233,7 +232,7 @@ static void prepare_write_message(struct ceph_connection *con) /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); + memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr)); /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); @@ -253,7 +252,7 @@ static void prepare_write_message(struct ceph_connection *con) con->out_msg->footer.data_crc = 0; if (m->data_length) { prepare_message_data(con->out_msg, m->data_length); - con->out_more = 1; /* data + footer will follow */ + con->v1.out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ prepare_write_message_footer(con); @@ -275,11 +274,11 @@ static void prepare_write_ack(struct ceph_connection *con) con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); - con->out_more = 1; /* more will follow.. eventually.. */ + con->v1.out_more = 1; /* more will follow.. eventually.. 
*/ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } @@ -294,9 +293,9 @@ static void prepare_write_seq(struct ceph_connection *con) con_out_kvec_reset(con); - con->out_temp_ack = cpu_to_le64(con->in_seq_acked); - con_out_kvec_add(con, sizeof (con->out_temp_ack), - &con->out_temp_ack); + con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof(con->v1.out_temp_ack), + &con->v1.out_temp_ack); ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } @@ -313,9 +312,9 @@ static void prepare_write_keepalive(struct ceph_connection *con) ktime_get_real_ts64(&now); con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); - ceph_encode_timespec64(&con->out_temp_keepalive2, &now); - con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), - &con->out_temp_keepalive2); + ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2), + &con->v1.out_temp_keepalive2); } else { con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); } @@ -332,19 +331,20 @@ static int get_connect_authorizer(struct ceph_connection *con) int auth_proto; if (!con->ops->get_authorizer) { - con->auth = NULL; - con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; - con->out_connect.authorizer_len = 0; + con->v1.auth = NULL; + con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->v1.out_connect.authorizer_len = 0; return 0; } - auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); + auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry); if (IS_ERR(auth)) return PTR_ERR(auth); - con->auth = auth; - con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); - con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); + con->v1.auth = auth; + con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->v1.out_connect.authorizer_len = + cpu_to_le32(auth->authorizer_buf_len); return 0; } @@ -357,18 +357,19 @@ static void prepare_write_banner(struct ceph_connection *con) con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), &con->msgr->my_enc_addr); - con->out_more = 0; + con->v1.out_more = 0; ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } static void __prepare_write_connect(struct ceph_connection *con) { - con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); - if (con->auth) - con_out_kvec_add(con, con->auth->authorizer_buf_len, - con->auth->authorizer_buf); + con_out_kvec_add(con, sizeof(con->v1.out_connect), + &con->v1.out_connect); + if (con->v1.auth) + con_out_kvec_add(con, con->v1.auth->authorizer_buf_len, + con->v1.auth->authorizer_buf); - con->out_more = 0; + con->v1.out_more = 0; ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); } @@ -393,15 +394,15 @@ static int prepare_write_connect(struct ceph_connection *con) } dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, - con->connect_seq, global_seq, proto); + con->v1.connect_seq, global_seq, proto); - con->out_connect.features = - cpu_to_le64(from_msgr(con->msgr)->supported_features); - con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); - con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); - con->out_connect.global_seq = cpu_to_le32(global_seq); - con->out_connect.protocol_version = cpu_to_le32(proto); - con->out_connect.flags = 0; + con->v1.out_connect.features = + cpu_to_le64(from_msgr(con->msgr)->supported_features); + con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->v1.out_connect.connect_seq = 
cpu_to_le32(con->v1.connect_seq); + con->v1.out_connect.global_seq = cpu_to_le32(global_seq); + con->v1.out_connect.protocol_version = cpu_to_le32(proto); + con->v1.out_connect.flags = 0; ret = get_connect_authorizer(con); if (ret) @@ -421,35 +422,36 @@ static int write_partial_kvec(struct ceph_connection *con) { int ret; - dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); - while (con->out_kvec_bytes > 0) { - ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, - con->out_kvec_left, con->out_kvec_bytes, - con->out_more); + dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes); + while (con->v1.out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur, + con->v1.out_kvec_left, + con->v1.out_kvec_bytes, + con->v1.out_more); if (ret <= 0) goto out; - con->out_kvec_bytes -= ret; - if (con->out_kvec_bytes == 0) + con->v1.out_kvec_bytes -= ret; + if (!con->v1.out_kvec_bytes) break; /* done */ /* account for full iov entries consumed */ - while (ret >= con->out_kvec_cur->iov_len) { - BUG_ON(!con->out_kvec_left); - ret -= con->out_kvec_cur->iov_len; - con->out_kvec_cur++; - con->out_kvec_left--; + while (ret >= con->v1.out_kvec_cur->iov_len) { + BUG_ON(!con->v1.out_kvec_left); + ret -= con->v1.out_kvec_cur->iov_len; + con->v1.out_kvec_cur++; + con->v1.out_kvec_left--; } /* and for a partially-consumed entry */ if (ret) { - con->out_kvec_cur->iov_len -= ret; - con->out_kvec_cur->iov_base += ret; + con->v1.out_kvec_cur->iov_len -= ret; + con->v1.out_kvec_cur->iov_base += ret; } } - con->out_kvec_left = 0; + con->v1.out_kvec_left = 0; ret = 1; out: dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, - con->out_kvec_bytes, con->out_kvec_left, ret); + con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret); return ret; /* done! 
*/ } @@ -530,17 +532,17 @@ static int write_partial_skip(struct ceph_connection *con) int more = MSG_MORE | MSG_SENDPAGE_NOTLAST; int ret; - dout("%s %p %d left\n", __func__, con, con->out_skip); - while (con->out_skip > 0) { - size_t size = min(con->out_skip, (int) PAGE_SIZE); + dout("%s %p %d left\n", __func__, con, con->v1.out_skip); + while (con->v1.out_skip > 0) { + size_t size = min(con->v1.out_skip, (int)PAGE_SIZE); - if (size == con->out_skip) + if (size == con->v1.out_skip) more = MSG_MORE; ret = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, size, more); if (ret <= 0) goto out; - con->out_skip -= ret; + con->v1.out_skip -= ret; } ret = 1; out: @@ -553,39 +555,39 @@ out: static void prepare_read_banner(struct ceph_connection *con) { dout("prepare_read_banner %p\n", con); - con->in_base_pos = 0; + con->v1.in_base_pos = 0; } static void prepare_read_connect(struct ceph_connection *con) { dout("prepare_read_connect %p\n", con); - con->in_base_pos = 0; + con->v1.in_base_pos = 0; } static void prepare_read_ack(struct ceph_connection *con) { dout("prepare_read_ack %p\n", con); - con->in_base_pos = 0; + con->v1.in_base_pos = 0; } static void prepare_read_seq(struct ceph_connection *con) { dout("prepare_read_seq %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_SEQ; + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_SEQ; } static void prepare_read_tag(struct ceph_connection *con) { dout("prepare_read_tag %p\n", con); - con->in_base_pos = 0; - con->in_tag = CEPH_MSGR_TAG_READY; + con->v1.in_base_pos = 0; + con->v1.in_tag = CEPH_MSGR_TAG_READY; } static void prepare_read_keepalive_ack(struct ceph_connection *con) { dout("prepare_read_keepalive_ack %p\n", con); - con->in_base_pos = 0; + con->v1.in_base_pos = 0; } /* @@ -595,7 +597,7 @@ static int prepare_read_message(struct ceph_connection *con) { dout("prepare_read_message %p\n", con); BUG_ON(con->in_msg != NULL); - con->in_base_pos = 0; + con->v1.in_base_pos = 0; con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; return 0; } @@ -603,13 +605,13 @@ static int prepare_read_message(struct ceph_connection *con) static int read_partial(struct ceph_connection *con, int end, int size, void *object) { - while (con->in_base_pos < end) { - int left = end - con->in_base_pos; + while (con->v1.in_base_pos < end) { + int left = end - con->v1.in_base_pos; int have = size - left; int ret = ceph_tcp_recvmsg(con->sock, object + have, left); if (ret <= 0) return ret; - con->in_base_pos += ret; + con->v1.in_base_pos += ret; } return 1; } @@ -623,28 +625,28 @@ static int read_partial_banner(struct ceph_connection *con) int end; int ret; - dout("read_partial_banner %p at %d\n", con, con->in_base_pos); + dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos); /* peer's banner */ size = strlen(CEPH_BANNER); end = size; - ret = read_partial(con, end, size, con->in_banner); + ret = read_partial(con, end, size, con->v1.in_banner); if (ret <= 0) goto out; - size = sizeof (con->actual_peer_addr); + size = sizeof(con->v1.actual_peer_addr); end += size; - ret = read_partial(con, end, size, &con->actual_peer_addr); + ret = read_partial(con, end, size, &con->v1.actual_peer_addr); if (ret <= 0) goto out; - ceph_decode_banner_addr(&con->actual_peer_addr); + ceph_decode_banner_addr(&con->v1.actual_peer_addr); - size = sizeof (con->peer_addr_for_me); + size = sizeof(con->v1.peer_addr_for_me); end += size; - ret = read_partial(con, end, size, &con->peer_addr_for_me); + ret = read_partial(con, end, size, 
&con->v1.peer_addr_for_me); if (ret <= 0) goto out; - ceph_decode_banner_addr(&con->peer_addr_for_me); + ceph_decode_banner_addr(&con->v1.peer_addr_for_me); out: return ret; @@ -656,34 +658,34 @@ static int read_partial_connect(struct ceph_connection *con) int end; int ret; - dout("read_partial_connect %p at %d\n", con, con->in_base_pos); + dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos); - size = sizeof (con->in_reply); + size = sizeof(con->v1.in_reply); end = size; - ret = read_partial(con, end, size, &con->in_reply); + ret = read_partial(con, end, size, &con->v1.in_reply); if (ret <= 0) goto out; - if (con->auth) { - size = le32_to_cpu(con->in_reply.authorizer_len); - if (size > con->auth->authorizer_reply_buf_len) { + if (con->v1.auth) { + size = le32_to_cpu(con->v1.in_reply.authorizer_len); + if (size > con->v1.auth->authorizer_reply_buf_len) { pr_err("authorizer reply too big: %d > %zu\n", size, - con->auth->authorizer_reply_buf_len); + con->v1.auth->authorizer_reply_buf_len); ret = -EINVAL; goto out; } end += size; ret = read_partial(con, end, size, - con->auth->authorizer_reply_buf); + con->v1.auth->authorizer_reply_buf); if (ret <= 0) goto out; } dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", - con, (int)con->in_reply.tag, - le32_to_cpu(con->in_reply.connect_seq), - le32_to_cpu(con->in_reply.global_seq)); + con, con->v1.in_reply.tag, + le32_to_cpu(con->v1.in_reply.connect_seq), + le32_to_cpu(con->v1.in_reply.global_seq)); out: return ret; } @@ -693,7 +695,7 @@ out: */ static int verify_hello(struct ceph_connection *con) { - if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + if (memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { pr_err("connect to %s got bad banner\n", ceph_pr_addr(&con->peer_addr)); con->error_msg = "protocol error, bad banner"; @@ -716,15 +718,15 @@ static int process_banner(struct ceph_connection *con) * end may not yet know their ip address, so if it's 0.0.0.0, give * them the benefit of the doubt. 
*/ - if (memcmp(&con->peer_addr, &con->actual_peer_addr, + if (memcmp(&con->peer_addr, &con->v1.actual_peer_addr, sizeof(con->peer_addr)) != 0 && - !(ceph_addr_is_blank(&con->actual_peer_addr) && - con->actual_peer_addr.nonce == con->peer_addr.nonce)) { + !(ceph_addr_is_blank(&con->v1.actual_peer_addr) && + con->v1.actual_peer_addr.nonce == con->peer_addr.nonce)) { pr_warn("wrong peer, want %s/%u, got %s/%u\n", ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce), - ceph_pr_addr(&con->actual_peer_addr), - le32_to_cpu(con->actual_peer_addr.nonce)); + ceph_pr_addr(&con->v1.actual_peer_addr), + le32_to_cpu(con->v1.actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; } @@ -734,8 +736,8 @@ static int process_banner(struct ceph_connection *con) */ if (ceph_addr_is_blank(my_addr)) { memcpy(&my_addr->in_addr, - &con->peer_addr_for_me.in_addr, - sizeof(con->peer_addr_for_me.in_addr)); + &con->v1.peer_addr_for_me.in_addr, + sizeof(con->v1.peer_addr_for_me.in_addr)); ceph_addr_set_port(my_addr, 0); ceph_encode_my_addr(con->msgr); dout("process_banner learned my addr is %s\n", @@ -749,13 +751,13 @@ static int process_connect(struct ceph_connection *con) { u64 sup_feat = from_msgr(con->msgr)->supported_features; u64 req_feat = from_msgr(con->msgr)->required_features; - u64 server_feat = le64_to_cpu(con->in_reply.features); + u64 server_feat = le64_to_cpu(con->v1.in_reply.features); int ret; - dout("process_connect on %p tag %d\n", con, (int)con->in_tag); + dout("process_connect on %p tag %d\n", con, con->v1.in_tag); - if (con->auth) { - int len = le32_to_cpu(con->in_reply.authorizer_len); + if (con->v1.auth) { + int len = le32_to_cpu(con->v1.in_reply.authorizer_len); /* * Any connection that defines ->get_authorizer() @@ -764,9 +766,10 @@ static int process_connect(struct ceph_connection *con) * * See get_connect_authorizer(). */ - if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { + if (con->v1.in_reply.tag == + CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { ret = con->ops->add_authorizer_challenge( - con, con->auth->authorizer_reply_buf, len); + con, con->v1.auth->authorizer_reply_buf, len); if (ret < 0) return ret; @@ -785,7 +788,7 @@ static int process_connect(struct ceph_connection *con) } } - switch (con->in_reply.tag) { + switch (con->v1.in_reply.tag) { case CEPH_MSGR_TAG_FEATURES: pr_err("%s%lld %s feature set mismatch," " my %llx < server's %llx, missing %llx\n", @@ -800,16 +803,16 @@ static int process_connect(struct ceph_connection *con) " my %d != server's %d\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr), - le32_to_cpu(con->out_connect.protocol_version), - le32_to_cpu(con->in_reply.protocol_version)); + le32_to_cpu(con->v1.out_connect.protocol_version), + le32_to_cpu(con->v1.in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; return -1; case CEPH_MSGR_TAG_BADAUTHORIZER: - con->auth_retry++; + con->v1.auth_retry++; dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, - con->auth_retry); - if (con->auth_retry == 2) { + con->v1.auth_retry); + if (con->v1.auth_retry == 2) { con->error_msg = "connect authorization failure"; return -1; } @@ -829,7 +832,7 @@ static int process_connect(struct ceph_connection *con) * dropped messages. 
*/ dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_reply.connect_seq)); + le32_to_cpu(con->v1.in_reply.connect_seq)); pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr)); @@ -855,9 +858,9 @@ static int process_connect(struct ceph_connection *con) * again with a larger value. */ dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", - le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_reply.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); + le32_to_cpu(con->v1.out_connect.connect_seq), + le32_to_cpu(con->v1.in_reply.connect_seq)); + con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq); con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) @@ -871,10 +874,10 @@ static int process_connect(struct ceph_connection *con) * again with a larger value. */ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.global_seq)); + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.global_seq)); ceph_get_global_seq(con->msgr, - le32_to_cpu(con->in_reply.global_seq)); + le32_to_cpu(con->v1.in_reply.global_seq)); con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) @@ -896,23 +899,24 @@ static int process_connect(struct ceph_connection *con) WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG); con->state = CEPH_CON_S_OPEN; - con->auth_retry = 0; /* we authenticated; clear flag */ - con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); - con->connect_seq++; + con->v1.auth_retry = 0; /* we authenticated; clear flag */ + con->v1.peer_global_seq = + le32_to_cpu(con->v1.in_reply.global_seq); + con->v1.connect_seq++; con->peer_features = server_feat; dout("process_connect got READY gseq %d cseq %d (%d)\n", - con->peer_global_seq, - le32_to_cpu(con->in_reply.connect_seq), - con->connect_seq); - WARN_ON(con->connect_seq != - le32_to_cpu(con->in_reply.connect_seq)); + con->v1.peer_global_seq, + le32_to_cpu(con->v1.in_reply.connect_seq), + con->v1.connect_seq); + WARN_ON(con->v1.connect_seq != + le32_to_cpu(con->v1.in_reply.connect_seq)); - if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY) ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); con->delay = 0; /* reset backoff memory */ - if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { + if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) { prepare_write_seq(con); prepare_read_seq(con); } else { @@ -942,10 +946,10 @@ static int process_connect(struct ceph_connection *con) */ static int read_partial_ack(struct ceph_connection *con) { - int size = sizeof (con->in_temp_ack); + int size = sizeof(con->v1.in_temp_ack); int end = size; - return read_partial(con, end, size, &con->in_temp_ack); + return read_partial(con, end, size, &con->v1.in_temp_ack); } /* @@ -953,9 +957,9 @@ static int read_partial_ack(struct ceph_connection *con) */ static void process_ack(struct ceph_connection *con) { - u64 ack = le64_to_cpu(con->in_temp_ack); + u64 ack = le64_to_cpu(con->v1.in_temp_ack); - if (con->in_tag == CEPH_MSGR_TAG_ACK) + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK) ceph_con_discard_sent(con, ack); else ceph_con_discard_requeued(con, ack); @@ -1045,39 +1049,39 @@ static int read_partial_message(struct ceph_connection *con) dout("read_partial_message con %p msg %p\n", con, m); /* header */ - size = sizeof (con->in_hdr); + size = sizeof(con->v1.in_hdr); end = size; - ret = read_partial(con, end, size, &con->in_hdr); + 
ret = read_partial(con, end, size, &con->v1.in_hdr); if (ret <= 0) return ret; - crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); - if (cpu_to_le32(crc) != con->in_hdr.crc) { + crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc)); + if (cpu_to_le32(crc) != con->v1.in_hdr.crc) { pr_err("read_partial_message bad hdr crc %u != expected %u\n", - crc, con->in_hdr.crc); + crc, con->v1.in_hdr.crc); return -EBADMSG; } - front_len = le32_to_cpu(con->in_hdr.front_len); + front_len = le32_to_cpu(con->v1.in_hdr.front_len); if (front_len > CEPH_MSG_MAX_FRONT_LEN) return -EIO; - middle_len = le32_to_cpu(con->in_hdr.middle_len); + middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) return -EIO; - data_len = le32_to_cpu(con->in_hdr.data_len); + data_len = le32_to_cpu(con->v1.in_hdr.data_len); if (data_len > CEPH_MSG_MAX_DATA_LEN) return -EIO; /* verify seq# */ - seq = le64_to_cpu(con->in_hdr.seq); + seq = le64_to_cpu(con->v1.in_hdr.seq); if ((s64)seq - (s64)con->in_seq < 1) { pr_info("skipping %s%lld %s seq %lld expected %lld\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr), seq, con->in_seq + 1); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; + con->v1.in_base_pos = -front_len - middle_len - data_len - + sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; return 1; } else if ((s64)seq - (s64)con->in_seq > 1) { pr_err("read_partial_message bad seq %lld expected %lld\n", @@ -1090,9 +1094,9 @@ static int read_partial_message(struct ceph_connection *con) if (!con->in_msg) { int skip = 0; - dout("got hdr type %d front %d data %d\n", con->in_hdr.type, + dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type, front_len, data_len); - ret = ceph_con_in_msg_alloc(con, &con->in_hdr, &skip); + ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip); if (ret < 0) return ret; @@ -1100,9 +1104,9 @@ static int read_partial_message(struct ceph_connection *con) if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); - con->in_base_pos = -front_len - middle_len - data_len - - sizeof_footer(con); - con->in_tag = CEPH_MSGR_TAG_READY; + con->v1.in_base_pos = -front_len - middle_len - + data_len - sizeof_footer(con); + con->v1.in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; return 1; } @@ -1214,8 +1218,8 @@ more: BUG_ON(!con->sock); - dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, - con->in_base_pos); + dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag, + con->v1.in_base_pos); if (con->state == CEPH_CON_S_V1_BANNER) { ret = read_partial_banner(con); @@ -1253,27 +1257,27 @@ more: WARN_ON(con->state != CEPH_CON_S_OPEN); - if (con->in_base_pos < 0) { + if (con->v1.in_base_pos < 0) { /* * skipping + discarding content. */ - ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); + ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos); if (ret <= 0) goto out; - dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); - con->in_base_pos += ret; - if (con->in_base_pos) + dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos); + con->v1.in_base_pos += ret; + if (con->v1.in_base_pos) goto more; } - if (con->in_tag == CEPH_MSGR_TAG_READY) { + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) { /* * what's next? 
*/ - ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); + ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1); if (ret <= 0) goto out; - dout("try_read got tag %d\n", (int)con->in_tag); - switch (con->in_tag) { + dout("try_read got tag %d\n", con->v1.in_tag); + switch (con->v1.in_tag) { case CEPH_MSGR_TAG_MSG: prepare_read_message(con); break; @@ -1291,7 +1295,7 @@ more: goto bad_tag; } } - if (con->in_tag == CEPH_MSGR_TAG_MSG) { + if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) { ret = read_partial_message(con); if (ret <= 0) { switch (ret) { @@ -1307,15 +1311,15 @@ more: } goto out; } - if (con->in_tag == CEPH_MSGR_TAG_READY) + if (con->v1.in_tag == CEPH_MSGR_TAG_READY) goto more; ceph_con_process_message(con); if (con->state == CEPH_CON_S_OPEN) prepare_read_tag(con); goto more; } - if (con->in_tag == CEPH_MSGR_TAG_ACK || - con->in_tag == CEPH_MSGR_TAG_SEQ) { + if (con->v1.in_tag == CEPH_MSGR_TAG_ACK || + con->v1.in_tag == CEPH_MSGR_TAG_SEQ) { /* * the final handshake seq exchange is semantically * equivalent to an ACK @@ -1326,7 +1330,7 @@ more: process_ack(con); goto more; } - if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { ret = read_keepalive_ack(con); if (ret <= 0) goto out; @@ -1338,7 +1342,7 @@ out: return ret; bad_tag: - pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); + pr_err("try_read bad tag %d\n", con->v1.in_tag); con->error_msg = "protocol error, garbage tag"; ret = -1; goto out; @@ -1369,7 +1373,7 @@ int ceph_con_v1_try_write(struct ceph_connection *con) prepare_read_banner(con); BUG_ON(con->in_msg); - con->in_tag = CEPH_MSGR_TAG_READY; + con->v1.in_tag = CEPH_MSGR_TAG_READY; dout("try_write initiating connect on %p new state %d\n", con, con->state); ret = ceph_tcp_connect(con); @@ -1380,16 +1384,16 @@ int ceph_con_v1_try_write(struct ceph_connection *con) } more: - dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); + dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes); BUG_ON(!con->sock); /* kvec data queued? */ - if (con->out_kvec_left) { + if (con->v1.out_kvec_left) { ret = write_partial_kvec(con); if (ret <= 0) goto out; } - if (con->out_skip) { + if (con->v1.out_skip) { ret = write_partial_skip(con); if (ret <= 0) goto out; @@ -1397,7 +1401,7 @@ more: /* msg pages? 
*/ if (con->out_msg) { - if (con->out_msg_done) { + if (con->v1.out_msg_done) { ceph_msg_put(con->out_msg); con->out_msg = NULL; /* we're done with this one */ goto do_next; @@ -1446,57 +1450,57 @@ void ceph_con_v1_revoke(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; - WARN_ON(con->out_skip); + WARN_ON(con->v1.out_skip); /* footer */ - if (con->out_msg_done) { - con->out_skip += con_out_kvec_skip(con); + if (con->v1.out_msg_done) { + con->v1.out_skip += con_out_kvec_skip(con); } else { WARN_ON(!msg->data_length); - con->out_skip += sizeof_footer(con); + con->v1.out_skip += sizeof_footer(con); } /* data, middle, front */ if (msg->data_length) - con->out_skip += msg->cursor.total_resid; + con->v1.out_skip += msg->cursor.total_resid; if (msg->middle) - con->out_skip += con_out_kvec_skip(con); - con->out_skip += con_out_kvec_skip(con); + con->v1.out_skip += con_out_kvec_skip(con); + con->v1.out_skip += con_out_kvec_skip(con); dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con, - con->out_kvec_bytes, con->out_skip); + con->v1.out_kvec_bytes, con->v1.out_skip); } void ceph_con_v1_revoke_incoming(struct ceph_connection *con) { - unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); - unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); - unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); + unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len); + unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len); + unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len); /* skip rest of message */ - con->in_base_pos = con->in_base_pos - + con->v1.in_base_pos = con->v1.in_base_pos - sizeof(struct ceph_msg_header) - front_len - middle_len - data_len - sizeof(struct ceph_msg_footer); - con->in_tag = CEPH_MSGR_TAG_READY; + con->v1.in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; - dout("%s con %p in_base_pos %d\n", __func__, con, con->in_base_pos); + dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos); } bool ceph_con_v1_opened(struct ceph_connection *con) { - return con->connect_seq; + return con->v1.connect_seq; } void ceph_con_v1_reset_session(struct ceph_connection *con) { - con->connect_seq = 0; - con->peer_global_seq = 0; + con->v1.connect_seq = 0; + con->v1.peer_global_seq = 0; } void ceph_con_v1_reset_protocol(struct ceph_connection *con) { - con->out_skip = 0; + con->v1.out_skip = 0; } -- cgit v1.2.3 From 285ea34fc876aa0a2c5e65d310c4a41269e2e5f2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 26 Oct 2020 16:47:20 +0100 Subject: libceph, ceph: incorporate nautilus cephx changes - request service tickets together with auth ticket. Currently we get auth ticket via CEPHX_GET_AUTH_SESSION_KEY op and then request service tickets via CEPHX_GET_PRINCIPAL_SESSION_KEY op in a separate message. Since nautilus, desired service tickets are shared together with auth ticket in CEPHX_GET_AUTH_SESSION_KEY reply. - propagate session key and connection secret, if any. In preparation for msgr2, update handle_reply() and verify_authorizer_reply() auth ops to propagate session key and connection secret. Since nautilus, if secure mode is negotiated, connection secret is shared either in CEPHX_GET_AUTH_SESSION_KEY reply (for mons) or in a final authorizer reply (for osds and mdses).
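To spell out the new plumbing (a sketch lifted from the hunks that follow, not code beyond the patch): handle_reply() gains output parameters for the session key and connection secret, verify_authorizer_reply() is extended the same way, and pre-msgr2 call sites simply pass NULL for all of them:

    int (*handle_reply)(struct ceph_auth_client *ac, int result,
                        void *buf, void *end, u8 *session_key,
                        int *session_key_len, u8 *con_secret,
                        int *con_secret_len);

    /* the existing MAuth path doesn't need the secrets: */
    ret = ac->ops->handle_reply(ac, result, payload, payload_end,
                                NULL, NULL, NULL, NULL);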
Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 5 +- include/linux/ceph/auth.h | 16 +++- net/ceph/auth.c | 12 ++- net/ceph/auth_none.c | 4 +- net/ceph/auth_x.c | 215 ++++++++++++++++++++++++++++++++++++--------- net/ceph/auth_x_protocol.h | 3 +- net/ceph/crypto.h | 3 + net/ceph/osd_client.c | 5 +- 8 files changed, 210 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a256d95ec99a..278fe67e2617 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5178,8 +5178,11 @@ static int verify_authorizer_reply(struct ceph_connection *con) struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; - return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } static int invalidate_authorizer(struct ceph_connection *con) diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 6728c2ee0205..d9e7d0bcdaf1 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -53,7 +53,9 @@ struct ceph_auth_client_ops { */ int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end); int (*handle_reply)(struct ceph_auth_client *ac, int result, - void *buf, void *end); + void *buf, void *end, u8 *session_key, + int *session_key_len, u8 *con_secret, + int *con_secret_len); /* * Create authorizer for connecting to a service, and verify @@ -69,7 +71,10 @@ struct ceph_auth_client_ops { void *challenge_buf, int challenge_buf_len); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, - struct ceph_authorizer *a); + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); void (*invalidate_authorizer)(struct ceph_auth_client *ac, int peer_type); @@ -126,8 +131,11 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, int challenge_buf_len); -extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a); +int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type); diff --git a/net/ceph/auth.c b/net/ceph/auth.c index fbeee068ea14..40d3d95344d9 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -240,7 +240,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->negotiating = false; } - ret = ac->ops->handle_reply(ac, result, payload, payload_end); + ret = ac->ops->handle_reply(ac, result, payload, payload_end, + NULL, NULL, NULL, NULL); if (ret == -EAGAIN) { ret = ceph_build_auth_request(ac, reply_buf, reply_len); } else if (ret) { @@ -332,13 +333,18 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge); int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { int ret = 0; mutex_lock(&ac->mutex); if (ac->ops && 
ac->ops->verify_authorizer_reply) - ret = ac->ops->verify_authorizer_reply(ac, a); + ret = ac->ops->verify_authorizer_reply(ac, a, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); mutex_unlock(&ac->mutex); return ret; } diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index edb7042479ed..af8ae507e861 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -70,7 +70,9 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end) * authenticate state, so nothing happens here. */ static int handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, u8 *session_key, + int *session_key_len, u8 *con_secret, + int *con_secret_len) { struct ceph_auth_none_info *xi = ac->private; diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 425508d4dafd..a265792642dc 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -269,22 +269,21 @@ out: static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, struct ceph_crypto_key *secret, - void *buf, void *end) + void **p, void *end) { - void *p = buf; u8 reply_struct_v; u32 num; int ret; - ceph_decode_8_safe(&p, end, reply_struct_v, bad); + ceph_decode_8_safe(p, end, reply_struct_v, bad); if (reply_struct_v != 1) return -EINVAL; - ceph_decode_32_safe(&p, end, num, bad); + ceph_decode_32_safe(p, end, num, bad); dout("%d tickets\n", num); while (num--) { - ret = process_one_ticket(ac, secret, &p, end); + ret = process_one_ticket(ac, secret, p, end); if (ret) return ret; } @@ -527,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; - auth->struct_v = 1; + auth->struct_v = 2; /* nautilus+ */ auth->key = 0; for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++) auth->key ^= *(__le64 *)u; @@ -540,6 +539,10 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; + /* nautilus+: request service tickets at the same time */ + need = ac->want_keys & ~CEPH_ENTITY_TYPE_AUTH; + WARN_ON(!need); + ceph_encode_32_safe(&p, end, need, e_range); return p - buf; } @@ -566,8 +569,82 @@ e_range: return -ERANGE; } +static int handle_auth_session_key(struct ceph_auth_client *ac, + void **p, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_x_info *xi = ac->private; + struct ceph_x_ticket_handler *th; + void *dp, *dend; + int len; + int ret; + + /* AUTH ticket */ + ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end); + if (ret) + return ret; + + if (*p == end) { + /* pre-nautilus (or didn't request service tickets!) 
*/ + WARN_ON(session_key || con_secret); + return 0; + } + + th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + if (IS_ERR(th)) + return PTR_ERR(th); + + if (session_key) { + memcpy(session_key, th->session_key.key, th->session_key.len); + *session_key_len = th->session_key.len; + } + + /* connection secret */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s connection secret blob len %d\n", __func__, len); + if (len > 0) { + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(&th->session_key, p, *p + len); + if (ret < 0) + return ret; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ceph_decode_32_safe(&dp, dend, len, e_inval); + if (len > CEPH_MAX_CON_SECRET_LEN) { + pr_err("connection secret too big %d\n", len); + return -EINVAL; + } + + dout("%s connection secret len %d\n", __func__, len); + if (con_secret) { + memcpy(con_secret, dp, len); + *con_secret_len = len; + } + } + + /* service tickets */ + ceph_decode_32_safe(p, end, len, e_inval); + dout("%s service tickets blob len %d\n", __func__, len); + if (len > 0) { + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, + p, *p + len); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, - void *buf, void *end) + void *buf, void *end, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { struct ceph_x_info *xi = ac->private; struct ceph_x_ticket_handler *th; @@ -599,8 +676,10 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, dout("handle_reply op %d result %d\n", op, result); switch (op) { case CEPHX_GET_AUTH_SESSION_KEY: - /* verify auth key */ - ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end); + /* AUTH ticket + [connection secret] + service tickets */ + ret = handle_auth_session_key(ac, &p, end, session_key, + session_key_len, con_secret, + con_secret_len); break; case CEPHX_GET_PRINCIPAL_SESSION_KEY: @@ -608,7 +687,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, if (IS_ERR(th)) return PTR_ERR(th); - ret = ceph_x_proc_ticket_reply(ac, &th->session_key, p, end); + /* service tickets */ + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, &p, end); break; default: @@ -687,40 +767,44 @@ static int ceph_x_update_authorizer( return 0; } -static int decrypt_authorize_challenge(struct ceph_x_authorizer *au, - void *challenge_buf, - int challenge_buf_len, - u64 *server_challenge) +/* + * CephXAuthorizeChallenge + */ +static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret, + void *challenge, int challenge_len, + u64 *server_challenge) { - struct ceph_x_authorize_challenge *ch = - challenge_buf + sizeof(struct ceph_x_encrypt_header); + void *dp, *dend; int ret; /* no leading len */ - ret = __ceph_x_decrypt(&au->session_key, challenge_buf, - challenge_buf_len); + ret = __ceph_x_decrypt(secret, challenge, challenge_len); if (ret < 0) return ret; - if (ret < sizeof(*ch)) { - pr_err("bad size %d for ceph_x_authorize_challenge\n", ret); - return -EINVAL; - } - *server_challenge = le64_to_cpu(ch->server_challenge); + dout("%s decrypted %d bytes\n", __func__, ret); + dp = challenge + sizeof(struct ceph_x_encrypt_header); + dend = dp + ret; + + ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */ + ceph_decode_64_safe(&dp, dend, *server_challenge, e_inval); + dout("%s server_challenge %llu\n", __func__, *server_challenge); return 0; + +e_inval: + return -EINVAL; } static int 
ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, - void *challenge_buf, - int challenge_buf_len) + void *challenge, int challenge_len) { struct ceph_x_authorizer *au = (void *)a; u64 server_challenge; int ret; - ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len, - &server_challenge); + ret = decrypt_authorizer_challenge(&au->session_key, challenge, + challenge_len, &server_challenge); if (ret) { pr_err("failed to decrypt authorize challenge: %d", ret); return ret; @@ -735,29 +819,76 @@ static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, return 0; } +/* + * CephXAuthorizeReply + */ +static int decrypt_authorizer_reply(struct ceph_crypto_key *secret, + void **p, void *end, u64 *nonce_plus_one, + u8 *con_secret, int *con_secret_len) +{ + void *dp, *dend; + u8 struct_v; + int len; + int ret; + + dp = *p + ceph_x_encrypt_offset(); + ret = ceph_x_decrypt(secret, p, end); + if (ret < 0) + return ret; + + dout("%s decrypted %d bytes\n", __func__, ret); + dend = dp + ret; + + ceph_decode_8_safe(&dp, dend, struct_v, e_inval); + ceph_decode_64_safe(&dp, dend, *nonce_plus_one, e_inval); + dout("%s nonce_plus_one %llu\n", __func__, *nonce_plus_one); + if (struct_v >= 2) { + ceph_decode_32_safe(&dp, dend, len, e_inval); + if (len > CEPH_MAX_CON_SECRET_LEN) { + pr_err("connection secret too big %d\n", len); + return -EINVAL; + } + + dout("%s connection secret len %d\n", __func__, len); + if (con_secret) { + memcpy(con_secret, dp, len); + *con_secret_len = len; + } + } + + return 0; + +e_inval: + return -EINVAL; +} + static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, - struct ceph_authorizer *a) + struct ceph_authorizer *a, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) { struct ceph_x_authorizer *au = (void *)a; - void *p = au->enc_buf; - struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset(); + u64 nonce_plus_one; int ret; - ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN); - if (ret < 0) + if (session_key) { + memcpy(session_key, au->session_key.key, au->session_key.len); + *session_key_len = au->session_key.len; + } + + ret = decrypt_authorizer_reply(&au->session_key, &reply, + reply + reply_len, &nonce_plus_one, + con_secret, con_secret_len); + if (ret) return ret; - if (ret < sizeof(*reply)) { - pr_err("bad size %d for ceph_x_authorize_reply\n", ret); - return -EINVAL; + + if (nonce_plus_one != au->nonce + 1) { + pr_err("failed to authenticate server\n"); + return -EPERM; } - if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one)) - ret = -EPERM; - else - ret = 0; - dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", - au->nonce, le64_to_cpu(reply->nonce_plus_one), ret); - return ret; + return 0; } static void ceph_x_reset(struct ceph_auth_client *ac) diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 24b0b74564d0..792fcb974dc3 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -38,7 +38,8 @@ struct ceph_x_authenticate { __u8 struct_v; __le64 client_challenge; __le64 key; - /* ticket blob */ + /* old_ticket blob */ + /* nautilus+: other_keys */ } __attribute__ ((packed)); struct ceph_x_service_ticket_request { diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h index 96ef4d860bc9..13bd526349fa 100644 --- a/net/ceph/crypto.h +++ b/net/ceph/crypto.h @@ -5,6 +5,9 @@ #include #include +#define CEPH_KEY_LEN 16 +#define CEPH_MAX_CON_SECRET_LEN 64 + /* * 
cryptographic secret */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7901ab6c79fd..8966eae543d3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5623,8 +5623,11 @@ static int verify_authorizer_reply(struct ceph_connection *con) struct ceph_osd *o = con->private; struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; - return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer); + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + auth->authorizer_reply_buf, auth->authorizer_reply_buf_len, + NULL, NULL, NULL, NULL); } static int invalidate_authorizer(struct ceph_connection *con) -- cgit v1.2.3 From 59711f9ec219bf5245a8e95989803fb503adc52d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 26 Oct 2020 17:01:53 +0100 Subject: libceph: amend cephx init_protocol() and build_request() In msgr2, initial authentication happens with an exchange of msgr2 control frames -- MAuth message and struct ceph_mon_request_header aren't used. Make that optional. Stop reporting cephx protocol as "x". Use "cephx" instead. Signed-off-by: Ilya Dryomov --- include/linux/ceph/ceph_fs.h | 1 + net/ceph/auth.c | 63 ++++++++++++++++++++++++-------------------- net/ceph/ceph_strings.c | 14 ++++++++++ 3 files changed, 50 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index d44d98033d58..6d986e52000b 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -95,6 +95,7 @@ struct ceph_dir_layout { #define CEPH_AUTH_UID_DEFAULT ((__u64) -1) +const char *ceph_auth_proto_name(int proto); /********************************************* * message layer diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 40d3d95344d9..deaf267f8942 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -21,15 +21,18 @@ static u32 supported_protocols[] = { CEPH_AUTH_CEPHX }; -static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +static int init_protocol(struct ceph_auth_client *ac, int proto) { - switch (protocol) { + dout("%s proto %d\n", __func__, proto); + + switch (proto) { case CEPH_AUTH_NONE: return ceph_auth_none_init(ac); case CEPH_AUTH_CEPHX: return ceph_x_init(ac); default: - return -ENOENT; + pr_err("bad auth protocol %d\n", proto); + return -EINVAL; } } @@ -145,31 +148,35 @@ bad: goto out; } -static int ceph_build_auth_request(struct ceph_auth_client *ac, - void *msg_buf, size_t msg_len) +static int build_request(struct ceph_auth_client *ac, bool add_header, + void *buf, int buf_len) { - struct ceph_mon_request_header *monhdr = msg_buf; - void *p = monhdr + 1; - void *end = msg_buf + msg_len; + void *end = buf + buf_len; + void *p; int ret; - monhdr->have_version = 0; - monhdr->session_mon = cpu_to_le16(-1); - monhdr->session_mon_tid = 0; - - ceph_encode_32(&p, ac->protocol); + p = buf; + if (add_header) { + /* struct ceph_mon_request_header + protocol */ + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_16_safe(&p, end, -1, e_range); + ceph_encode_64_safe(&p, end, 0, e_range); + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + } + ceph_encode_need(&p, end, sizeof(u32), e_range); ret = ac->ops->build_request(ac, p + sizeof(u32), end); if (ret < 0) { - pr_err("error %d building auth method %s request\n", ret, - ac->ops->name); - goto out; + pr_err("auth protocol '%s' building request failed: %d\n", + ceph_auth_proto_name(ac->protocol), ret); 
+ return ret; } dout(" built request %d bytes\n", ret); ceph_encode_32(&p, ret); - ret = p + ret - msg_buf; -out: - return ret; + return p + ret - buf; + +e_range: + return -ERANGE; } /* @@ -229,10 +236,10 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ac->ops = NULL; } if (ac->protocol != protocol) { - ret = ceph_auth_init_protocol(ac, protocol); + ret = init_protocol(ac, protocol); if (ret) { - pr_err("error %d on auth protocol %d init\n", - ret, protocol); + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(protocol), ret); goto out; } } @@ -242,11 +249,11 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ret = ac->ops->handle_reply(ac, result, payload, payload_end, NULL, NULL, NULL, NULL); - if (ret == -EAGAIN) { - ret = ceph_build_auth_request(ac, reply_buf, reply_len); - } else if (ret) { - pr_err("auth method '%s' error %d\n", ac->ops->name, ret); - } + if (ret == -EAGAIN) + ret = build_request(ac, true, reply_buf, reply_len); + else if (ret) + pr_err("auth protocol '%s' mauth authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); out: mutex_unlock(&ac->mutex); @@ -265,7 +272,7 @@ int ceph_build_auth(struct ceph_auth_client *ac, mutex_lock(&ac->mutex); if (ac->ops->should_authenticate(ac)) - ret = ceph_build_auth_request(ac, msg_buf, msg_len); + ret = build_request(ac, true, msg_buf, msg_len); mutex_unlock(&ac->mutex); return ret; } diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 10e01494993c..69cd391e02a6 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -18,6 +18,20 @@ const char *ceph_entity_type_name(int type) } EXPORT_SYMBOL(ceph_entity_type_name); +const char *ceph_auth_proto_name(int proto) +{ + switch (proto) { + case CEPH_AUTH_UNKNOWN: + return "unknown"; + case CEPH_AUTH_NONE: + return "none"; + case CEPH_AUTH_CEPHX: + return "cephx"; + default: + return "???"; + } +} + const char *ceph_osd_op_name(int op) { switch (op) { -- cgit v1.2.3 From c1c0ce78f479cf4d7dfe72c4c1cabbf0bc0730c9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 26 Oct 2020 17:05:44 +0100 Subject: libceph: drop ac->ops->name field Signed-off-by: Ilya Dryomov --- include/linux/ceph/auth.h | 2 -- net/ceph/auth_none.c | 1 - net/ceph/auth_x.c | 1 - 3 files changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index d9e7d0bcdaf1..5f64f66309fa 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -32,8 +32,6 @@ struct ceph_auth_handshake { }; struct ceph_auth_client_ops { - const char *name; - /* * true if we are authenticated and can connect to * services. 
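[ Editor's note: with ->name gone, a printable protocol name now comes
  from ceph_auth_proto_name() (added in the previous patch), keyed on
  ac->protocol rather than on the ops vector -- e.g., illustratively:

	pr_err("auth protocol '%s' mauth authentication failed: %d\n",
	       ceph_auth_proto_name(ac->protocol), result); ]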
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index af8ae507e861..70e86e462250 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -118,7 +118,6 @@ static int ceph_auth_none_create_authorizer( } static const struct ceph_auth_client_ops ceph_auth_none_ops = { - .name = "none", .reset = reset, .destroy = destroy, .is_authenticated = is_authenticated, diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index a265792642dc..9815cfe42af0 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -1058,7 +1058,6 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, } static const struct ceph_auth_client_ops ceph_x_ops = { - .name = "x", .is_authenticated = ceph_x_is_authenticated, .should_authenticate = ceph_x_should_authenticate, .build_request = ceph_x_build_request, -- cgit v1.2.3 From a5cbd5fc22d5043a8a76e15d75d031fe24d1f69c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 30 Oct 2020 13:30:51 +0100 Subject: libceph, ceph: get and handle cluster maps with addrvecs In preparation for msgr2, make the cluster send us maps with addrvecs including both LEGACY and MSGR2 addrs instead of a single LEGACY addr. This means advertising support for SERVER_NAUTILUS and also some older features: SERVER_MIMIC, MONENC and MONNAMES. MONNAMES and MONENC are actually pre-argonaut, we just never updated ceph_monmap_decode() for them. Decoding is unconditional, see commit 23c625ce3065 ("libceph: assume argonaut on the server side"). SERVER_MIMIC doesn't bear any meaning for the kernel client. Since ceph_decode_entity_addrvec() is guarded by encoding version checks (and in msgr2 case it is guarded implicitly by the fact that server is speaking msgr2), we assume MSG_ADDR2 for it. Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 2 +- fs/ceph/mdsmap.c | 21 +++--- include/linux/ceph/ceph_features.h | 11 ++- include/linux/ceph/decode.h | 4 + include/linux/ceph/mdsmap.h | 2 +- include/linux/ceph/osdmap.h | 4 +- net/ceph/decode.c | 56 ++++++++++++++ net/ceph/mon_client.c | 145 +++++++++++++++++++++++++++---------- net/ceph/osd_client.c | 4 +- net/ceph/osdmap.c | 45 ++++++++---- 10 files changed, 222 insertions(+), 72 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 278fe67e2617..afd22815fbda 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5014,7 +5014,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) return; } - newmap = ceph_mdsmap_decode(&p, end); + newmap = ceph_mdsmap_decode(&p, end, false); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad_unlock; diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 1096d1d3a84c..abd9af7727ad 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -114,7 +114,7 @@ bad: * Ignore any fields we don't care about (there are quite a few of * them). 
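 *
 * Editor's note: the msgr2 flag added below matters only for info_v >= 8
 * encodings, where each mds is described by an addrvec and either the
 * LEGACY or the MSGR2 addr has to be picked out of it.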
*/ -struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) +struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) { struct ceph_mdsmap *m; const void *start = *p; @@ -201,18 +201,19 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) namelen = ceph_decode_32(p); /* skip mds name */ *p += namelen; - ceph_decode_need(p, end, - 4*sizeof(u32) + sizeof(u64) + - sizeof(addr) + sizeof(struct ceph_timespec), - bad); - mds = ceph_decode_32(p); - inc = ceph_decode_32(p); - state = ceph_decode_32(p); + ceph_decode_32_safe(p, end, mds, bad); + ceph_decode_32_safe(p, end, inc, bad); + ceph_decode_32_safe(p, end, state, bad); *p += sizeof(u64); /* state_seq */ - err = ceph_decode_entity_addr(p, end, &addr); + if (info_v >= 8) + err = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + err = ceph_decode_entity_addr(p, end, &addr); if (err) goto corrupt; - ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + + ceph_decode_copy_safe(p, end, &laggy_since, sizeof(laggy_since), + bad); laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 999636d53cf2..3a47acd9cc14 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -8,7 +8,8 @@ * feature. Base case is 1 (first use). */ #define CEPH_FEATURE_INCARNATION_1 (0ull) -#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL +#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // SERVER_JEWEL +#define CEPH_FEATURE_INCARNATION_3 ((1ull<<57)|(1ull<<28)) // SERVER_MIMIC #define DEFINE_CEPH_FEATURE(bit, incarnation, name) \ static const uint64_t __maybe_unused CEPH_FEATURE_##name = (1ULL< #include @@ -82,3 +83,58 @@ bad: } EXPORT_SYMBOL(ceph_decode_entity_addr); +/* + * Return addr of desired type (MSGR2 or LEGACY) or error. + * Make sure there is only one match. + * + * Assume encoding with MSG_ADDR2. + */ +int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + __le32 my_type = msgr2 ? 
CEPH_ENTITY_ADDR_TYPE_MSGR2 : + CEPH_ENTITY_ADDR_TYPE_LEGACY; + struct ceph_entity_addr tmp_addr; + int addr_cnt; + bool found; + u8 marker; + int ret; + int i; + + ceph_decode_8_safe(p, end, marker, e_inval); + if (marker != 2) { + pr_err("bad addrvec marker %d\n", marker); + return -EINVAL; + } + + ceph_decode_32_safe(p, end, addr_cnt, e_inval); + + found = false; + for (i = 0; i < addr_cnt; i++) { + ret = ceph_decode_entity_addr(p, end, &tmp_addr); + if (ret) + return ret; + + if (tmp_addr.type == my_type) { + if (found) { + pr_err("another match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -EINVAL; + } + + memcpy(addr, &tmp_addr, sizeof(*addr)); + found = true; + } + } + if (!found && addr_cnt != 0) { + pr_err("no match of type %d in addrvec\n", + le32_to_cpu(my_type)); + return -ENOENT; + } + + return 0; + +e_inval: + return -EINVAL; +} +EXPORT_SYMBOL(ceph_decode_entity_addrvec); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index ebfecf8d0918..a9754a7fa78c 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -36,57 +36,122 @@ static const struct ceph_connection_operations mon_con_ops; static int __validate_auth(struct ceph_mon_client *monc); +static int decode_mon_info(void **p, void *end, bool msgr2, + struct ceph_entity_addr *addr) +{ + void *mon_info_end; + u32 struct_len; + u8 struct_v; + int ret; + + ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v, + &struct_len); + if (ret) + return ret; + + mon_info_end = *p + struct_len; + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + ret = ceph_decode_entity_addrvec(p, end, msgr2, addr); + if (ret) + return ret; + + *p = mon_info_end; + return 0; + +e_inval: + return -EINVAL; +} + /* * Decode a monmap blob (e.g., during mount). + * + * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC). 
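+ *
+ * Editor's sketch of the layout handled below (v6+ fields in brackets):
+ * u32 blob length, versioned "monmap" header, fsid, epoch,
+ * [last_changed, created, persistent and optional feature sets,]
+ * num_mon, then per mon: name string plus mon_info_t (v6+) or
+ * entity_addr_t.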
*/ -static struct ceph_monmap *ceph_monmap_decode(void *p, void *end) +static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2) { - struct ceph_monmap *m = NULL; - int i, err = -EINVAL; + struct ceph_monmap *monmap = NULL; struct ceph_fsid fsid; - u32 epoch, num_mon; - u32 len; + u32 struct_len; + int blob_len; + int num_mon; + u8 struct_v; + u32 epoch; + int ret; + int i; + + ceph_decode_32_safe(p, end, blob_len, e_inval); + ceph_decode_need(p, end, blob_len, e_inval); + + ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len); + if (ret) + goto fail; + + dout("%s struct_v %d\n", __func__, struct_v); + ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval); + ceph_decode_32_safe(p, end, epoch, e_inval); + if (struct_v >= 6) { + u32 feat_struct_len; + u8 feat_struct_v; - ceph_decode_32_safe(&p, end, len, bad); - ceph_decode_need(&p, end, len, bad); + *p += sizeof(struct ceph_timespec); /* skip last_changed */ + *p += sizeof(struct ceph_timespec); /* skip created */ - dout("monmap_decode %p %p len %d (%d)\n", p, end, len, (int)(end-p)); - p += sizeof(u16); /* skip version */ + ret = ceph_start_decoding(p, end, 1, "mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; - ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); - ceph_decode_copy(&p, &fsid, sizeof(fsid)); - epoch = ceph_decode_32(&p); + *p += feat_struct_len; /* skip persistent_features */ - num_mon = ceph_decode_32(&p); + ret = ceph_start_decoding(p, end, 1, "mon_feature_t", + &feat_struct_v, &feat_struct_len); + if (ret) + goto fail; + *p += feat_struct_len; /* skip optional_features */ + } + ceph_decode_32_safe(p, end, num_mon, e_inval); + + dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch, + num_mon); if (num_mon > CEPH_MAX_MON) - goto bad; - m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS); - if (m == NULL) - return ERR_PTR(-ENOMEM); - m->fsid = fsid; - m->epoch = epoch; - m->num_mon = num_mon; - for (i = 0; i < num_mon; ++i) { - struct ceph_entity_inst *inst = &m->mon_inst[i]; - - /* copy name portion */ - ceph_decode_copy_safe(&p, end, &inst->name, - sizeof(inst->name), bad); - err = ceph_decode_entity_addr(&p, end, &inst->addr); - if (err) - goto bad; + goto e_inval; + + monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO); + if (!monmap) { + ret = -ENOMEM; + goto fail; } - dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, - m->num_mon); - for (i = 0; i < m->num_mon; i++) - dout("monmap_decode mon%d is %s\n", i, - ceph_pr_addr(&m->mon_inst[i].addr)); - return m; -bad: - dout("monmap_decode failed with %d\n", err); - kfree(m); - return ERR_PTR(err); + monmap->fsid = fsid; + monmap->epoch = epoch; + monmap->num_mon = num_mon; + + /* legacy_mon_addr map or mon_info map */ + for (i = 0; i < num_mon; i++) { + struct ceph_entity_inst *inst = &monmap->mon_inst[i]; + + ceph_decode_skip_string(p, end, e_inval); /* skip mon name */ + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); + + if (struct_v >= 6) + ret = decode_mon_info(p, end, msgr2, &inst->addr); + else + ret = ceph_decode_entity_addr(p, end, &inst->addr); + if (ret) + goto fail; + + dout("%s mon%d addr %s\n", __func__, i, + ceph_pr_addr(&inst->addr)); + } + + return monmap; + +e_inval: + ret = -EINVAL; +fail: + kfree(monmap); + return ERR_PTR(ret); } /* @@ -476,7 +541,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, p = msg->front.iov_base; end = p + msg->front.iov_len; - monmap = ceph_monmap_decode(p, end); + 
monmap = ceph_monmap_decode(&p, end, false); if (IS_ERR(monmap)) { pr_err("problem decoding monmap, %d\n", (int)PTR_ERR(monmap)); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 8966eae543d3..51be5a7482fc 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3918,9 +3918,9 @@ static int handle_one_map(struct ceph_osd_client *osdc, set_pool_was_full(osdc); if (incremental) - newmap = osdmap_apply_incremental(&p, end, osdc->osdmap); + newmap = osdmap_apply_incremental(&p, end, false, osdc->osdmap); else - newmap = ceph_osdmap_decode(&p, end); + newmap = ceph_osdmap_decode(&p, end, false); if (IS_ERR(newmap)) return PTR_ERR(newmap); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index fa08c15be0c0..2b1dd252f231 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1647,7 +1647,8 @@ static int decode_old_pg_upmap_items(void **p, void *end, /* * decode a full map. */ -static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) +static int osdmap_decode(void **p, void *end, bool msgr2, + struct ceph_osdmap *map) { u8 struct_v; u32 epoch = 0; @@ -1718,9 +1719,16 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) goto e_inval; for (i = 0; i < map->max_osd; i++) { - err = ceph_decode_entity_addr(p, end, &map->osd_addr[i]); + struct ceph_entity_addr *addr = &map->osd_addr[i]; + + if (struct_v >= 8) + err = ceph_decode_entity_addrvec(p, end, msgr2, addr); + else + err = ceph_decode_entity_addr(p, end, addr); if (err) goto bad; + + dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr)); } /* pg_temp */ @@ -1790,7 +1798,7 @@ bad: /* * Allocate and decode a full map. */ -struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2) { struct ceph_osdmap *map; int ret; @@ -1799,7 +1807,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) if (!map) return ERR_PTR(-ENOMEM); - ret = osdmap_decode(p, end, map); + ret = osdmap_decode(p, end, msgr2, map); if (ret) { ceph_osdmap_destroy(map); return ERR_PTR(ret); @@ -1817,12 +1825,13 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * new_state: { osd=6, xorstate=EXISTS } # clear osd_state */ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, - struct ceph_osdmap *map) + bool msgr2, struct ceph_osdmap *map) { void *new_up_client; void *new_state; void *new_weight_end; u32 len; + int ret; int i; new_up_client = *p; @@ -1831,8 +1840,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_entity_addr addr; ceph_decode_skip_32(p, end, e_inval); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; } new_state = *p; @@ -1874,7 +1887,6 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, while (len--) { s32 osd; u32 xorstate; - int ret; osd = ceph_decode_32(p); if (struct_v >= 5) @@ -1910,8 +1922,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, osd = ceph_decode_32(p); BUG_ON(osd >= map->max_osd); - if (ceph_decode_entity_addr(p, end, &addr)) - goto e_inval; + if (struct_v >= 7) + ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); + else + ret = ceph_decode_entity_addr(p, end, &addr); + if (ret) + return ret; + + dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); + pr_info("osd%d up\n", osd); map->osd_state[osd] 
|= CEPH_OSD_EXISTS | CEPH_OSD_UP; map->osd_addr[osd] = addr; @@ -1927,7 +1946,7 @@ e_inval: /* * decode and apply an incremental map update. */ -struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, struct ceph_osdmap *map) { struct ceph_fsid fsid; @@ -1962,7 +1981,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); - return ceph_osdmap_decode(p, min(*p+len, end)); + return ceph_osdmap_decode(p, min(*p+len, end), msgr2); } /* new crush? */ @@ -2014,7 +2033,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_up_client, new_state, new_weight */ - err = decode_new_up_state_weight(p, end, struct_v, map); + err = decode_new_up_state_weight(p, end, struct_v, msgr2, map); if (err) goto bad; -- cgit v1.2.3 From 313771e80fd253d4b5472e61a2d12b03c5293aa9 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 25 Nov 2020 14:41:59 +0100 Subject: libceph, rbd: ignore addr->type while comparing in some cases For libceph, this ensures that libceph instance sharing (share option) continues to work. For rbd, this avoids blocklisting alive lock owners (locker addr is always LEGACY, while watcher addr is ANY in nautilus). Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 8 ++++++-- include/linux/ceph/msgr.h | 9 ++++++++- net/ceph/mon_client.c | 6 ++++-- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f84128abade3..bec85c054522 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3957,8 +3957,12 @@ static int find_watcher(struct rbd_device *rbd_dev, sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); for (i = 0; i < num_watchers; i++) { - if (!memcmp(&watchers[i].addr, &locker->info.addr, - sizeof(locker->info.addr)) && + /* + * Ignore addr->type while comparing. This mimics + * entity_addr_t::get_legacy_str() + strcmp(). + */ + if (ceph_addr_equal_no_type(&watchers[i].addr, + &locker->info.addr) && watchers[i].cookie == cookie) { struct rbd_client_id cid = { .gid = le64_to_cpu(watchers[i].name.num), diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 46939485f2c3..9a897a60f20b 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -52,11 +52,18 @@ extern const char *ceph_entity_type_name(int type); * entity_addr -- network address */ struct ceph_entity_addr { - __le32 type; + __le32 type; /* CEPH_ENTITY_ADDR_TYPE_* */ __le32 nonce; /* unique id for process (e.g. 
pid) */ struct sockaddr_storage in_addr; } __attribute__ ((packed)); +static inline bool ceph_addr_equal_no_type(const struct ceph_entity_addr *lhs, + const struct ceph_entity_addr *rhs) +{ + return !memcmp(&lhs->in_addr, &rhs->in_addr, sizeof(lhs->in_addr)) && + lhs->nonce == rhs->nonce; +} + struct ceph_entity_inst { struct ceph_entity_name name; struct ceph_entity_addr addr; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index a9754a7fa78c..f5f090b4e409 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -161,9 +161,11 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) { int i; - for (i = 0; i < m->num_mon; i++) - if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) + for (i = 0; i < m->num_mon; i++) { + if (ceph_addr_equal_no_type(addr, &m->mon_inst[i].addr)) return 1; + } + return 0; } -- cgit v1.2.3 From 00498b994113a871a556f7ff24a4cf8a00611700 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 19 Nov 2020 16:04:58 +0100 Subject: libceph: introduce connection modes and ms_mode option msgr2 supports two connection modes: crc (plain) and secure (on-wire encryption). The connection mode is picked by the server based on input from the client. Introduce ms_mode option:
  ms_mode=legacy - msgr1 (default)
  ms_mode=crc - crc mode, if denied fail
  ms_mode=secure - secure mode, if denied fail
  ms_mode=prefer-crc - crc mode, if denied agree to secure mode
  ms_mode=prefer-secure - secure mode, if denied agree to crc mode
ms_mode affects all connections; we don't separate connections to mons the way it's done in userspace with ms_client_mode vs ms_mon_client_mode. For now the default is legacy, to be flipped to prefer-crc after some time. Signed-off-by: Ilya Dryomov --- include/linux/ceph/auth.h | 8 ++++-- include/linux/ceph/ceph_fs.h | 6 +++++ include/linux/ceph/libceph.h | 1 + net/ceph/auth.c | 12 ++++++--- net/ceph/ceph_common.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ net/ceph/ceph_strings.c | 14 ++++++++++ net/ceph/mon_client.c | 4 +-- 7 files changed, 100 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 5f64f66309fa..6fc058fe9efa 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -98,11 +98,15 @@ struct ceph_auth_client { const struct ceph_crypto_key *key; /* our secret key */ unsigned want_keys; /* which services we want */ + int preferred_mode; /* CEPH_CON_MODE_* */ + int fallback_mode; /* ditto */ + struct mutex mutex; }; -extern struct ceph_auth_client *ceph_auth_init(const char *name, - const struct ceph_crypto_key *key); +struct ceph_auth_client *ceph_auth_init(const char *name, + const struct ceph_crypto_key *key, + const int *con_modes); extern void ceph_auth_destroy(struct ceph_auth_client *ac); extern void ceph_auth_reset(struct ceph_auth_client *ac); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 6d986e52000b..ce22d5469670 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -93,9 +93,15 @@ struct ceph_dir_layout { #define CEPH_AUTH_NONE 0x1 #define CEPH_AUTH_CEPHX 0x2 +/* msgr2 protocol modes */ +#define CEPH_CON_MODE_UNKNOWN 0x0 +#define CEPH_CON_MODE_CRC 0x1 +#define CEPH_CON_MODE_SECURE 0x2 + #define CEPH_AUTH_UID_DEFAULT ((__u64) -1) const char *ceph_auth_proto_name(int proto); +const char *ceph_con_mode_name(int mode); /********************************************* * message layer diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index eb5a7ca13f9c..8765a5ad267a 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -53,6 +53,7 @@ struct ceph_options { unsigned long osd_keepalive_timeout; /* jiffies */ unsigned long osd_request_timeout; /* jiffies */ u32 read_from_replica; /* CEPH_OSD_FLAG_BALANCE/LOCALIZE_READS */ + int con_modes[2]; /* CEPH_CON_MODE_* */ /* * any type that can't be simply compared or doesn't need diff --git a/net/ceph/auth.c b/net/ceph/auth.c index deaf267f8942..4a0f32b32cc6 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -39,13 +39,13 @@ static int init_protocol(struct ceph_auth_client *ac, int proto) /* * setup, teardown. */ -struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key) +struct ceph_auth_client *ceph_auth_init(const char *name, + const struct ceph_crypto_key *key, + const int *con_modes) { struct ceph_auth_client *ac; int ret; - dout("auth_init name '%s'\n", name); - ret = -ENOMEM; ac = kzalloc(sizeof(*ac), GFP_NOFS); if (!ac) @@ -57,8 +57,12 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp ac->name = name; else ac->name = CEPH_AUTH_NAME_DEFAULT; - dout("auth_init name %s\n", ac->name); ac->key = key; + ac->preferred_mode = con_modes[0]; + ac->fallback_mode = con_modes[1]; + + dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__, + ac->name, ac->preferred_mode, ac->fallback_mode); return ac; out: diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4e7edd707a14..271287c5ec12 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -265,6 +265,7 @@ enum { Opt_ip, Opt_crush_location, Opt_read_from_replica, + Opt_ms_mode, /* string args above */ Opt_share, Opt_crc, @@ -287,6 +288,23 @@ static const struct constant_table ceph_param_read_from_replica[] = { {} }; +enum ceph_ms_mode { + Opt_ms_mode_legacy, + Opt_ms_mode_crc, + Opt_ms_mode_secure, + Opt_ms_mode_prefer_crc, + Opt_ms_mode_prefer_secure +}; + +static const struct constant_table ceph_param_ms_mode[] = { + {"legacy", Opt_ms_mode_legacy}, + {"crc", Opt_ms_mode_crc}, + {"secure", Opt_ms_mode_secure}, + {"prefer-crc", Opt_ms_mode_prefer_crc}, + {"prefer-secure", Opt_ms_mode_prefer_secure}, + {} +}; + static const struct fs_parameter_spec ceph_parameters[] = { fsparam_flag ("abort_on_full", Opt_abort_on_full), fsparam_flag_no ("cephx_require_signatures", Opt_cephx_require_signatures), @@ -305,6 +323,8 @@ static const struct fs_parameter_spec ceph_parameters[] = { fs_param_deprecated, NULL), fsparam_enum ("read_from_replica", Opt_read_from_replica, ceph_param_read_from_replica), + fsparam_enum ("ms_mode", Opt_ms_mode, + ceph_param_ms_mode), fsparam_string ("secret", Opt_secret), fsparam_flag_no ("share", Opt_share), fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay), @@ -333,6 +353,8 @@ struct ceph_options *ceph_alloc_options(void) opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT; + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; return opt; } EXPORT_SYMBOL(ceph_alloc_options); @@ -503,6 +525,32 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, BUG(); } break; + case Opt_ms_mode: + switch (result.uint_32) { + case Opt_ms_mode_legacy: + opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = 
CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN; + break; + case Opt_ms_mode_prefer_crc: + opt->con_modes[0] = CEPH_CON_MODE_CRC; + opt->con_modes[1] = CEPH_CON_MODE_SECURE; + break; + case Opt_ms_mode_prefer_secure: + opt->con_modes[0] = CEPH_CON_MODE_SECURE; + opt->con_modes[1] = CEPH_CON_MODE_CRC; + break; + default: + BUG(); + } + break; case Opt_osdtimeout: warn_plog(&log, "Ignoring osdtimeout"); @@ -616,6 +664,21 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) { seq_puts(m, "read_from_replica=localize,"); } + if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) { + if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=crc,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) { + seq_puts(m, "ms_mode=secure,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_CRC && + opt->con_modes[1] == CEPH_CON_MODE_SECURE) { + seq_puts(m, "ms_mode=prefer-crc,"); + } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE && + opt->con_modes[1] == CEPH_CON_MODE_CRC) { + seq_puts(m, "ms_mode=prefer-secure,"); + } + } if (opt->flags & CEPH_OPT_FSID) seq_printf(m, "fsid=%pU,", &opt->fsid); diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 69cd391e02a6..355fea272120 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -32,6 +32,20 @@ const char *ceph_auth_proto_name(int proto) } } +const char *ceph_con_mode_name(int mode) +{ + switch (mode) { + case CEPH_CON_MODE_UNKNOWN: + return "unknown"; + case CEPH_CON_MODE_CRC: + return "crc"; + case CEPH_CON_MODE_SECURE: + return "secure"; + default: + return "???"; + } +} + const char *ceph_osd_op_name(int op) { switch (op) { diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index f5f090b4e409..792a8c4164d7 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1156,8 +1156,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) /* connection */ /* authentication */ - monc->auth = ceph_auth_init(cl->options->name, - cl->options->key); + monc->auth = ceph_auth_init(cl->options->name, cl->options->key, + cl->options->con_modes); if (IS_ERR(monc->auth)) { err = PTR_ERR(monc->auth); goto out_monmap; -- cgit v1.2.3 From cd1a677cad994021b19665ed476aea63f5d54f31 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 19 Nov 2020 16:59:08 +0100 Subject: libceph, ceph: implement msgr2.1 protocol (crc and secure modes) Implement msgr2.1 wire protocol, available since nautilus 14.2.11 and octopus 15.2.5. msgr2.0 wire protocol is not implemented -- it has several security, integrity and robustness issues and is therefore considered deprecated.
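[ Editor's note: msgr2.1 is opted into with the ms_mode option
  introduced earlier in this series; a hypothetical mapping could look
  like

	rbd map -o ms_mode=prefer-crc mypool/myimage

  while ms_mode=legacy (still the default) keeps msgr1 behaviour. ]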
Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 80 +- include/linux/ceph/auth.h | 36 +- include/linux/ceph/ceph_fs.h | 4 + include/linux/ceph/decode.h | 4 + include/linux/ceph/libceph.h | 9 +- include/linux/ceph/messenger.h | 136 +- include/linux/ceph/msgr.h | 48 + net/ceph/Kconfig | 3 + net/ceph/Makefile | 2 +- net/ceph/auth.c | 309 ++++ net/ceph/decode.c | 45 + net/ceph/messenger.c | 68 +- net/ceph/messenger_v2.c | 3443 ++++++++++++++++++++++++++++++++++++++++ net/ceph/mon_client.c | 115 +- net/ceph/osd_client.c | 85 +- 15 files changed, 4356 insertions(+), 31 deletions(-) create mode 100644 net/ceph/messenger_v2.c (limited to 'include/linux') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index afd22815fbda..740d63d0fc50 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5014,7 +5014,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) return; } - newmap = ceph_mdsmap_decode(&p, end, false); + newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client)); if (IS_ERR(newmap)) { err = PTR_ERR(newmap); goto bad_unlock; @@ -5196,6 +5196,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&mdsc->fsc->client->monc); } +static int mds_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int mds_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int mds_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth; + struct ceph_auth_handshake *auth = &s->s_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int mds_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS, + used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) { @@ -5245,6 +5319,10 @@ static const struct ceph_connection_operations 
mds_con_ops = { .alloc_msg = mds_alloc_msg, .sign_message = mds_sign_message, .check_message_signature = mds_check_message_signature, + .get_auth_request = mds_get_auth_request, + .handle_auth_reply_more = mds_handle_auth_reply_more, + .handle_auth_done = mds_handle_auth_done, + .handle_auth_bad_method = mds_handle_auth_bad_method, }; /* eof */ diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 6fc058fe9efa..3fbe72ebd779 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -120,8 +120,12 @@ int ceph_auth_entity_name_encode(const char *name, void **p, void *end); extern int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len); - extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); + +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode); extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth); @@ -157,4 +161,34 @@ int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth, return auth->check_message_signature(auth, msg); return 0; } + +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len); +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len); +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len); +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len); +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); + #endif diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index ce22d5469670..e41a811026f6 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -93,6 +93,10 @@ struct ceph_dir_layout { #define CEPH_AUTH_NONE 0x1 #define CEPH_AUTH_CEPHX 0x2 +#define CEPH_AUTH_MODE_NONE 0 +#define CEPH_AUTH_MODE_AUTHORIZER 1 +#define CEPH_AUTH_MODE_MON 10 + /* msgr2 protocol modes */ #define CEPH_CON_MODE_UNKNOWN 0x0 #define CEPH_CON_MODE_CRC 0x1 diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 9a934e04f841..04f3ace5787b 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -221,6 +221,7 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv, #define CEPH_ENTITY_ADDR_TYPE_NONE 0 #define CEPH_ENTITY_ADDR_TYPE_LEGACY __cpu_to_le32(1) #define CEPH_ENTITY_ADDR_TYPE_MSGR2 __cpu_to_le32(2) +#define CEPH_ENTITY_ADDR_TYPE_ANY __cpu_to_le32(3) static inline void ceph_encode_banner_addr(struct ceph_entity_addr *a) { @@ -243,6 +244,9 @@ extern int 
ceph_decode_entity_addr(void **p, void *end, int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, struct ceph_entity_addr *addr); +int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr); +void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr); + /* * encoders */ diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 8765a5ad267a..eb9008bb3992 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -31,10 +31,10 @@ #define CEPH_OPT_FSID (1<<0) #define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ -#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ +#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes (msgr1) */ #define CEPH_OPT_NOMSGAUTH (1<<4) /* don't require msg signing feat */ #define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */ -#define CEPH_OPT_NOMSGSIGN (1<<6) /* don't sign msgs */ +#define CEPH_OPT_NOMSGSIGN (1<<6) /* don't sign msgs (msgr1) */ #define CEPH_OPT_ABORT_ON_FULL (1<<7) /* abort w/ ENOSPC when full */ #define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) @@ -84,6 +84,7 @@ struct ceph_options { #define CEPH_MONC_HUNT_BACKOFF 2 #define CEPH_MONC_HUNT_MAX_MULT 10 +#define CEPH_MSG_MAX_CONTROL_LEN (16*1024*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) @@ -152,6 +153,10 @@ struct ceph_client { #define from_msgr(ms) container_of(ms, struct ceph_client, msgr) +static inline bool ceph_msgr2(struct ceph_client *client) +{ + return client->options->con_modes[0] != CEPH_CON_MODE_UNKNOWN; +} /* * snapshots diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 54a64e8dfce6..0e6e9ad3c3bf 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -3,6 +3,7 @@ #define __FS_CEPH_MESSENGER_H #include +#include #include #include #include @@ -52,6 +53,23 @@ struct ceph_connection_operations { int (*sign_message) (struct ceph_msg *msg); int (*check_message_signature) (struct ceph_msg *msg); + + /* msgr2 authentication exchange */ + int (*get_auth_request)(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len); + int (*handle_auth_reply_more)(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len); + int (*handle_auth_done)(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len); + int (*handle_auth_bad_method)(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt); }; /* use format string %s%lld */ @@ -246,8 +264,15 @@ struct ceph_msg { #define CEPH_CON_S_PREOPEN 2 #define CEPH_CON_S_V1_BANNER 3 #define CEPH_CON_S_V1_CONNECT_MSG 4 -#define CEPH_CON_S_OPEN 5 -#define CEPH_CON_S_STANDBY 6 +#define CEPH_CON_S_V2_BANNER_PREFIX 5 +#define CEPH_CON_S_V2_BANNER_PAYLOAD 6 +#define CEPH_CON_S_V2_HELLO 7 +#define CEPH_CON_S_V2_AUTH 8 +#define CEPH_CON_S_V2_AUTH_SIGNATURE 9 +#define CEPH_CON_S_V2_SESSION_CONNECT 10 +#define CEPH_CON_S_V2_SESSION_RECONNECT 11 +#define CEPH_CON_S_OPEN 12 +#define CEPH_CON_S_STANDBY 13 /* * ceph_connection flag bits @@ -301,6 +326,99 @@ struct ceph_connection_v1_info { u32 peer_global_seq; /* peer's global seq for this connection */ }; +#define CEPH_CRC_LEN 4 +#define CEPH_GCM_KEY_LEN 16 
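+/*
+ * Editor's note: these are the AES-128-GCM parameters used by secure
+ * mode -- 16-byte key and block, 12-byte nonce (4-byte fixed part plus
+ * 8-byte counter, see struct ceph_gcm_nonce) and a 16-byte auth tag.
+ */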
+#define CEPH_GCM_IV_LEN sizeof(struct ceph_gcm_nonce) +#define CEPH_GCM_BLOCK_LEN 16 +#define CEPH_GCM_TAG_LEN 16 + +#define CEPH_PREAMBLE_LEN 32 +#define CEPH_PREAMBLE_INLINE_LEN 48 +#define CEPH_PREAMBLE_PLAIN_LEN CEPH_PREAMBLE_LEN +#define CEPH_PREAMBLE_SECURE_LEN (CEPH_PREAMBLE_LEN + \ + CEPH_PREAMBLE_INLINE_LEN + \ + CEPH_GCM_TAG_LEN) +#define CEPH_EPILOGUE_PLAIN_LEN (1 + 3 * CEPH_CRC_LEN) +#define CEPH_EPILOGUE_SECURE_LEN (CEPH_GCM_BLOCK_LEN + CEPH_GCM_TAG_LEN) + +#define CEPH_FRAME_MAX_SEGMENT_COUNT 4 + +struct ceph_frame_desc { + int fd_tag; /* FRAME_TAG_* */ + int fd_seg_cnt; + int fd_lens[CEPH_FRAME_MAX_SEGMENT_COUNT]; /* logical */ + int fd_aligns[CEPH_FRAME_MAX_SEGMENT_COUNT]; +}; + +struct ceph_gcm_nonce { + __le32 fixed; + __le64 counter __packed; +}; + +struct ceph_connection_v2_info { + struct iov_iter in_iter; + struct kvec in_kvecs[5]; /* recvmsg */ + struct bio_vec in_bvec; /* recvmsg (in_cursor) */ + int in_kvec_cnt; + int in_state; /* IN_S_* */ + + struct iov_iter out_iter; + struct kvec out_kvecs[8]; /* sendmsg */ + struct bio_vec out_bvec; /* sendpage (out_cursor, out_zero), + sendmsg (out_enc_pages) */ + int out_kvec_cnt; + int out_state; /* OUT_S_* */ + + int out_zero; /* # of zero bytes to send */ + bool out_iter_sendpage; /* use sendpage if possible */ + + struct ceph_frame_desc in_desc; + struct ceph_msg_data_cursor in_cursor; + struct ceph_msg_data_cursor out_cursor; + + struct crypto_shash *hmac_tfm; /* post-auth signature */ + struct crypto_aead *gcm_tfm; /* on-wire encryption */ + struct aead_request *gcm_req; + struct crypto_wait gcm_wait; + struct ceph_gcm_nonce in_gcm_nonce; + struct ceph_gcm_nonce out_gcm_nonce; + + struct page **out_enc_pages; + int out_enc_page_cnt; + int out_enc_resid; + int out_enc_i; + + int con_mode; /* CEPH_CON_MODE_* */ + + void *conn_bufs[16]; + int conn_buf_cnt; + + struct kvec in_sign_kvecs[8]; + struct kvec out_sign_kvecs[8]; + int in_sign_kvec_cnt; + int out_sign_kvec_cnt; + + u64 client_cookie; + u64 server_cookie; + u64 global_seq; + u64 connect_seq; + u64 peer_global_seq; + + u8 in_buf[CEPH_PREAMBLE_SECURE_LEN]; + u8 out_buf[CEPH_PREAMBLE_SECURE_LEN]; + struct { + u8 late_status; /* FRAME_LATE_STATUS_* */ + union { + struct { + u32 front_crc; + u32 middle_crc; + u32 data_crc; + } __packed; + u8 pad[CEPH_GCM_BLOCK_LEN - 1]; + }; + } out_epil; +}; + /* * A single connection with another host. 
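 * Editor's note: the protocol-specific state lives in the v1/v2 union
 * at the bottom of this struct; ceph_msgr2() determines which member
 * is in use.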
* @@ -346,7 +464,10 @@ struct ceph_connection { struct delayed_work work; /* send|recv work */ unsigned long delay; /* current delay interval */ - struct ceph_connection_v1_info v1; + union { + struct ceph_connection_v1_info v1; + struct ceph_connection_v2_info v2; + }; }; extern struct page *ceph_zero_page; @@ -397,6 +518,15 @@ bool ceph_con_v1_opened(struct ceph_connection *con); void ceph_con_v1_reset_session(struct ceph_connection *con); void ceph_con_v1_reset_protocol(struct ceph_connection *con); +/* messenger_v2.c */ +int ceph_con_v2_try_read(struct ceph_connection *con); +int ceph_con_v2_try_write(struct ceph_connection *con); +void ceph_con_v2_revoke(struct ceph_connection *con); +void ceph_con_v2_revoke_incoming(struct ceph_connection *con); +bool ceph_con_v2_opened(struct ceph_connection *con); +void ceph_con_v2_reset_session(struct ceph_connection *con); +void ceph_con_v2_reset_protocol(struct ceph_connection *con); + extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 9a897a60f20b..f5e02f6c0655 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -14,9 +14,39 @@ * constant. */ #define CEPH_BANNER "ceph v027" +#define CEPH_BANNER_LEN 9 #define CEPH_BANNER_MAX_LEN 30 +/* + * messenger V2 connection banner prefix. + * The full banner string should have the form: "ceph v2\n" + * the 2 bytes are the length of the remaining banner. + */ +#define CEPH_BANNER_V2 "ceph v2\n" +#define CEPH_BANNER_V2_LEN 8 +#define CEPH_BANNER_V2_PREFIX_LEN (CEPH_BANNER_V2_LEN + sizeof(__le16)) + +/* + * messenger V2 features + */ +#define CEPH_MSGR2_INCARNATION_1 (0ull) + +#define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \ + static const uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \ + static const uint64_t CEPH_MSGR2_FEATUREMASK_##name = \ + (1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation); + +#define HAVE_MSGR2_FEATURE(x, name) \ + (((x) & (CEPH_MSGR2_FEATUREMASK_##name)) == (CEPH_MSGR2_FEATUREMASK_##name)) + +DEFINE_MSGR2_FEATURE( 0, 1, REVISION_1) // msgr2.1 + +#define CEPH_MSGR2_SUPPORTED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1) + +#define CEPH_MSGR2_REQUIRED_FEATURES (CEPH_MSGR2_FEATURE_REVISION_1) + + /* * Rollover-safe type and comparator for 32-bit sequence numbers. * Comparator returns -1, 0, or 1. @@ -158,6 +188,24 @@ struct ceph_msg_header { __le32 crc; /* header crc32c */ } __attribute__ ((packed)); +struct ceph_msg_header2 { + __le64 seq; /* message seq# for this session */ + __le64 tid; /* transaction id */ + __le16 type; /* message type */ + __le16 priority; /* priority. higher value == higher priority */ + __le16 version; /* version of message encoding */ + + __le32 data_pre_padding_len; + __le16 data_off; /* sender: include full offset; + receiver: mask against ~PAGE_MASK */ + + __le64 ack_seq; + __u8 flags; + /* oldest code we think can decode this. unknown if zero. 
*/ + __le16 compat_version; + __le16 reserved; +} __attribute__ ((packed)); + #define CEPH_MSG_PRIO_LOW 64 #define CEPH_MSG_PRIO_DEFAULT 127 #define CEPH_MSG_PRIO_HIGH 196 diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index f36f9a3a4e20..c5c4eef3a9ff 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -5,6 +5,9 @@ config CEPH_LIB select LIBCRC32C select CRYPTO_AES select CRYPTO_CBC + select CRYPTO_GCM + select CRYPTO_HMAC + select CRYPTO_SHA256 select CRYPTO select KEYS default n diff --git a/net/ceph/Makefile b/net/ceph/Makefile index df02bd8d6c7b..8802a0c0155d 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -15,4 +15,4 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ auth_x.o \ ceph_strings.o ceph_hash.o \ pagevec.o snapshot.o string_table.o \ - messenger_v1.o + messenger_v1.o messenger_v2.o diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 4a0f32b32cc6..6b315c8212b1 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -293,6 +293,39 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac) } EXPORT_SYMBOL(ceph_auth_is_authenticated); +int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, bool force_new, + int *proto, int *pref_mode, int *fallb_mode) +{ + int ret; + + mutex_lock(&ac->mutex); + if (force_new && auth->authorizer) { + ceph_auth_destroy_authorizer(auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) + ret = ac->ops->create_authorizer(ac, peer_type, auth); + else if (ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, auth); + else + ret = 0; + if (ret) + goto out; + + *proto = ac->protocol; + if (pref_mode && fallb_mode) { + *pref_mode = ac->preferred_mode; + *fallb_mode = ac->fallback_mode; + } + +out: + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(__ceph_auth_get_authorizer); + int ceph_auth_create_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth) @@ -369,3 +402,279 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) mutex_unlock(&ac->mutex); } EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); + +/* + * msgr2 authentication + */ + +static bool contains(const int *arr, int cnt, int val) +{ + int i; + + for (i = 0; i < cnt; i++) { + if (arr[i] == val) + return true; + } + + return false; +} + +static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode) +{ + WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN); + if (fallb_mode != CEPH_CON_MODE_UNKNOWN) { + ceph_encode_32_safe(p, end, 2, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + ceph_encode_32_safe(p, end, fallb_mode, e_range); + } else { + ceph_encode_32_safe(p, end, 1, e_range); + ceph_encode_32_safe(p, end, pref_mode, e_range); + } + + return 0; + +e_range: + return -ERANGE; +} + +/* + * Similar to ceph_auth_build_hello(). + */ +int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len) +{ + int proto = ac->key ? 
CEPH_AUTH_CEPHX : CEPH_AUTH_NONE; + void *end = buf + buf_len; + void *lenp; + void *p; + int ret; + + mutex_lock(&ac->mutex); + if (ac->protocol == CEPH_AUTH_UNKNOWN) { + ret = init_protocol(ac, proto); + if (ret) { + pr_err("auth protocol '%s' init failed: %d\n", + ceph_auth_proto_name(proto), ret); + goto out; + } + } else { + WARN_ON(ac->protocol != proto); + ac->ops->reset(ac); + } + + p = buf; + ceph_encode_32_safe(&p, end, ac->protocol, e_range); + ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode); + if (ret) + goto out; + + lenp = p; + p += 4; /* space for len */ + + ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range); + ret = ceph_auth_entity_name_encode(ac->name, &p, end); + if (ret) + goto out; + + ceph_encode_64_safe(&p, end, ac->global_id, e_range); + ceph_encode_32(&lenp, p - lenp - 4); + ret = p - buf; + +out: + mutex_unlock(&ac->mutex); + return ret; + +e_range: + ret = -ERANGE; + goto out; +} + +int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply, + int reply_len, void *buf, int buf_len) +{ + int ret; + + mutex_lock(&ac->mutex); + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + NULL, NULL, NULL, NULL); + if (ret == -EAGAIN) + ret = build_request(ac, false, buf, buf_len); + else + WARN_ON(ret >= 0); + mutex_unlock(&ac->mutex); + return ret; +} + +int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + int ret; + + mutex_lock(&ac->mutex); + if (global_id && ac->global_id != global_id) { + dout("%s global_id %llu -> %llu\n", __func__, ac->global_id, + global_id); + ac->global_id = global_id; + } + + ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + mutex_unlock(&ac->mutex); + return ret; +} + +bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed\n", + ceph_auth_proto_name(ac->protocol)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed\n", + ceph_con_mode_name(ac->preferred_mode)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed\n", + ceph_con_mode_name(ac->fallback_mode)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' msgr authentication failed: %d\n", + ceph_auth_proto_name(ac->protocol), result); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} + +int ceph_auth_get_authorizer(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + int peer_type, void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + int pref_mode, fallb_mode; + int proto; + void *p; + int ret; + + ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto, + &pref_mode, &fallb_mode); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, proto, e_range); + ret = encode_con_modes(&p, end, 
pref_mode, fallb_mode); + if (ret) + return ret; + + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_get_authorizer); + +int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + void *buf, int *buf_len) +{ + void *end = buf + *buf_len; + void *p; + int ret; + + ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer, + reply, reply_len); + if (ret) + return ret; + + p = buf; + ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range); + *buf_len = p - buf; + return 0; + +e_range: + return -ERANGE; +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more); + +int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac, + struct ceph_auth_handshake *auth, + void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + return ceph_auth_verify_authorizer_reply(ac, auth->authorizer, + reply, reply_len, session_key, session_key_len, + con_secret, con_secret_len); +} +EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done); + +bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac, + int peer_type, int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + mutex_lock(&ac->mutex); + WARN_ON(used_proto != ac->protocol); + + if (result == -EOPNOTSUPP) { + if (!contains(allowed_protos, proto_cnt, ac->protocol)) { + pr_err("auth protocol '%s' not allowed by %s\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) && + (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN || + !contains(allowed_modes, mode_cnt, ac->fallback_mode))) { + pr_err("preferred mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->preferred_mode), + ceph_entity_type_name(peer_type)); + if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN) + pr_err("no fallback mode\n"); + else + pr_err("fallback mode '%s' not allowed by %s\n", + ceph_con_mode_name(ac->fallback_mode), + ceph_entity_type_name(peer_type)); + goto not_allowed; + } + } + + WARN_ON(result == -EOPNOTSUPP || result >= 0); + pr_err("auth protocol '%s' authorization to %s failed: %d\n", + ceph_auth_proto_name(ac->protocol), + ceph_entity_type_name(peer_type), result); + + if (ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); + + mutex_unlock(&ac->mutex); + return true; + +not_allowed: + mutex_unlock(&ac->mutex); + return false; +} +EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer); diff --git a/net/ceph/decode.c b/net/ceph/decode.c index 6429b6713507..b44f7651be04 100644 --- a/net/ceph/decode.c +++ b/net/ceph/decode.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include + #include static int @@ -138,3 +140,46 @@ e_inval: return -EINVAL; } EXPORT_SYMBOL(ceph_decode_entity_addrvec); + +static int get_sockaddr_encoding_len(sa_family_t family) +{ + union { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } u; + + switch (family) { + case AF_INET: + return sizeof(u.sin); + case AF_INET6: + return sizeof(u.sin6); + default: + return sizeof(u); + } +} + +int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len; 
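+	/*
+	 * Editor's note on the count above: 1 marker byte, the standard
+	 * start-of-encoding block, 4 bytes each for type, nonce and the
+	 * sockaddr length, then addr_len bytes of sockaddr -- mirroring
+	 * ceph_encode_entity_addr() below.
+	 */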
+} + +void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr) +{ + sa_family_t family = get_unaligned(&addr->in_addr.ss_family); + int addr_len = get_sockaddr_encoding_len(family); + + ceph_encode_8(p, 1); /* marker */ + ceph_start_encoding(p, 1, 1, sizeof(addr->type) + + sizeof(addr->nonce) + + sizeof(u32) + addr_len); + ceph_encode_copy(p, &addr->type, sizeof(addr->type)); + ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce)); + + ceph_encode_32(p, addr_len); + ceph_encode_16(p, family); + ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family)); +} diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 4fb3c33a7b03..57d043b382ed 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -195,8 +195,11 @@ EXPORT_SYMBOL(ceph_pr_addr); void ceph_encode_my_addr(struct ceph_messenger *msgr) { - memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); - ceph_encode_banner_addr(&msgr->my_enc_addr); + if (!ceph_msgr2(from_msgr(msgr))) { + memcpy(&msgr->my_enc_addr, &msgr->inst.addr, + sizeof(msgr->my_enc_addr)); + ceph_encode_banner_addr(&msgr->my_enc_addr); + } } /* @@ -513,7 +516,10 @@ static void ceph_con_reset_protocol(struct ceph_connection *con) con->out_msg = NULL; } - ceph_con_v1_reset_protocol(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_protocol(con); + else + ceph_con_v1_reset_protocol(con); } /* @@ -526,6 +532,7 @@ static void ceph_msg_remove(struct ceph_msg *msg) ceph_msg_put(msg); } + static void ceph_msg_remove_list(struct list_head *head) { while (!list_empty(head)) { @@ -547,7 +554,10 @@ void ceph_con_reset_session(struct ceph_connection *con) con->in_seq = 0; con->in_seq_acked = 0; - ceph_con_v1_reset_session(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_reset_session(con); + else + ceph_con_v1_reset_session(con); } /* @@ -600,6 +610,9 @@ EXPORT_SYMBOL(ceph_con_open); */ bool ceph_con_opened(struct ceph_connection *con) { + if (ceph_msgr2(from_msgr(con->msgr))) + return ceph_con_v2_opened(con); + return ceph_con_v1_opened(con); } @@ -1302,7 +1315,16 @@ int ceph_parse_ips(const char *c, const char *end, } ceph_addr_set_port(&addr[i], port); + /* + * We want the type to be set according to ms_mode + * option, but options are normally parsed after mon + * addresses. Rather than complicating parsing, set + * to LEGACY and override in build_initial_monmap() + * for mon addresses and ceph_messenger_init() for + * ip option. 
+ */ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; + addr[i].nonce = 0; dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); @@ -1410,6 +1432,13 @@ static bool con_sock_closed(struct ceph_connection *con) CASE(PREOPEN); CASE(V1_BANNER); CASE(V1_CONNECT_MSG); + CASE(V2_BANNER_PREFIX); + CASE(V2_BANNER_PAYLOAD); + CASE(V2_HELLO); + CASE(V2_AUTH); + CASE(V2_AUTH_SIGNATURE); + CASE(V2_SESSION_CONNECT); + CASE(V2_SESSION_RECONNECT); CASE(OPEN); CASE(STANDBY); default: @@ -1494,7 +1523,10 @@ static void ceph_con_workfn(struct work_struct *work) BUG_ON(con->sock); } - ret = ceph_con_v1_try_read(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_read(con); + else + ret = ceph_con_v1_try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -1504,7 +1536,10 @@ static void ceph_con_workfn(struct work_struct *work) break; } - ret = ceph_con_v1_try_write(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ret = ceph_con_v2_try_write(con); + else + ret = ceph_con_v1_try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; @@ -1538,9 +1573,8 @@ static void con_fault(struct ceph_connection *con) ceph_pr_addr(&con->peer_addr), con->error_msg); con->error_msg = NULL; - WARN_ON(con->state != CEPH_CON_S_V1_BANNER && - con->state != CEPH_CON_S_V1_CONNECT_MSG && - con->state != CEPH_CON_S_OPEN); + WARN_ON(con->state == CEPH_CON_S_STANDBY || + con->state == CEPH_CON_S_CLOSED); ceph_con_reset_protocol(con); @@ -1596,7 +1630,11 @@ void ceph_messenger_init(struct ceph_messenger *msgr, ceph_addr_set_port(&msgr->inst.addr, 0); } - msgr->inst.addr.type = 0; + /* + * Since nautilus, clients are identified using type ANY. + * For msgr1, ceph_encode_banner_addr() munges it to NONE. + */ + msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY; /* generate a random non-zero nonce */ do { @@ -1706,7 +1744,10 @@ void ceph_msg_revoke(struct ceph_msg *msg) if (con->out_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was sending\n", __func__, con, msg); - ceph_con_v1_revoke(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke(con); + else + ceph_con_v1_revoke(con); ceph_msg_put(con->out_msg); con->out_msg = NULL; } else { @@ -1732,7 +1773,10 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg) if (con->in_msg == msg) { WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was recving\n", __func__, con, msg); - ceph_con_v1_revoke_incoming(con); + if (ceph_msgr2(from_msgr(con->msgr))) + ceph_con_v2_revoke_incoming(con); + else + ceph_con_v1_revoke_incoming(con); ceph_msg_put(con->in_msg); con->in_msg = NULL; } else { diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c new file mode 100644 index 000000000000..5e38c847317b --- /dev/null +++ b/net/ceph/messenger_v2.c @@ -0,0 +1,3443 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ceph msgr2 protocol implementation + * + * Copyright (C) 2020 Ilya Dryomov + */ + +#include + +#include +#include /* for crypto_memneq() */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */ + +#define FRAME_TAG_HELLO 1 +#define FRAME_TAG_AUTH_REQUEST 2 +#define FRAME_TAG_AUTH_BAD_METHOD 3 +#define FRAME_TAG_AUTH_REPLY_MORE 4 +#define FRAME_TAG_AUTH_REQUEST_MORE 5 +#define FRAME_TAG_AUTH_DONE 6 +#define FRAME_TAG_AUTH_SIGNATURE 7 +#define FRAME_TAG_CLIENT_IDENT 8 +#define FRAME_TAG_SERVER_IDENT 9 +#define FRAME_TAG_IDENT_MISSING_FEATURES 10 +#define 
FRAME_TAG_SESSION_RECONNECT 11 +#define FRAME_TAG_SESSION_RESET 12 +#define FRAME_TAG_SESSION_RETRY 13 +#define FRAME_TAG_SESSION_RETRY_GLOBAL 14 +#define FRAME_TAG_SESSION_RECONNECT_OK 15 +#define FRAME_TAG_WAIT 16 +#define FRAME_TAG_MESSAGE 17 +#define FRAME_TAG_KEEPALIVE2 18 +#define FRAME_TAG_KEEPALIVE2_ACK 19 +#define FRAME_TAG_ACK 20 + +#define FRAME_LATE_STATUS_ABORTED 0x1 +#define FRAME_LATE_STATUS_COMPLETE 0xe +#define FRAME_LATE_STATUS_ABORTED_MASK 0xf + +#define IN_S_HANDLE_PREAMBLE 1 +#define IN_S_HANDLE_CONTROL 2 +#define IN_S_HANDLE_CONTROL_REMAINDER 3 +#define IN_S_PREPARE_READ_DATA 4 +#define IN_S_PREPARE_READ_DATA_CONT 5 +#define IN_S_HANDLE_EPILOGUE 6 +#define IN_S_FINISH_SKIP 7 + +#define OUT_S_QUEUE_DATA 1 +#define OUT_S_QUEUE_DATA_CONT 2 +#define OUT_S_QUEUE_ENC_PAGE 3 +#define OUT_S_QUEUE_ZEROS 4 +#define OUT_S_FINISH_MESSAGE 5 +#define OUT_S_GET_NEXT 6 + +#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN) +#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN) +#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN) +#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN) + +#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) + +static int do_recvmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_recvmsg(sock, &msg, msg.msg_flags); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +/* + * Read as much as possible. + * + * Return: + * 1 - done, nothing (else) to read + * 0 - socket is empty, need to wait + * <0 - error + */ +static int ceph_tcp_recv(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p %s %zu\n", __func__, con, + iov_iter_is_discard(&con->v2.in_iter) ? "discard" : "need", + iov_iter_count(&con->v2.in_iter)); + ret = do_recvmsg(con->sock, &con->v2.in_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.in_iter)); + return ret; +} + +static int do_sendmsg(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + int ret; + + msg.msg_iter = *it; + while (iov_iter_count(it)) { + ret = sock_sendmsg(sock, &msg); + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + WARN_ON(msg_data_left(&msg)); + return 1; +} + +static int do_try_sendpage(struct socket *sock, struct iov_iter *it) +{ + struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; + struct bio_vec bv; + int ret; + + if (WARN_ON(!iov_iter_is_bvec(it))) + return -EINVAL; + + while (iov_iter_count(it)) { + /* iov_iter_iovec() for ITER_BVEC */ + bv.bv_page = it->bvec->bv_page; + bv.bv_offset = it->bvec->bv_offset + it->iov_offset; + bv.bv_len = min(iov_iter_count(it), + it->bvec->bv_len - it->iov_offset); + + /* + * sendpage cannot properly handle pages with + * page_count == 0, we need to fall back to sendmsg if + * that's the case. + * + * Same goes for slab pages: skb_can_coalesce() allows + * coalescing neighboring slab objects into a single frag + * which triggers one of hardened usercopy checks. 
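+ *
+ * (sendpage_ok() from <linux/net.h> captures both conditions: it
+ * checks !PageSlab(page) && page_count(page) >= 1.)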
+ */ + if (sendpage_ok(bv.bv_page)) { + ret = sock->ops->sendpage(sock, bv.bv_page, + bv.bv_offset, bv.bv_len, + CEPH_MSG_FLAGS); + } else { + iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, bv.bv_len); + ret = sock_sendmsg(sock, &msg); + } + if (ret <= 0) { + if (ret == -EAGAIN) + ret = 0; + return ret; + } + + iov_iter_advance(it, ret); + } + + return 1; +} + +/* + * Write as much as possible. The socket is expected to be corked, + * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here. + * + * Return: + * 1 - done, nothing (else) to write + * 0 - socket is full, need to wait + * <0 - error + */ +static int ceph_tcp_send(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p have %zu try_sendpage %d\n", __func__, con, + iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage); + if (con->v2.out_iter_sendpage) + ret = do_try_sendpage(con->sock, &con->v2.out_iter); + else + ret = do_sendmsg(con->sock, &con->v2.out_iter); + dout("%s con %p ret %d left %zu\n", __func__, con, ret, + iov_iter_count(&con->v2.out_iter)); + return ret; +} + +static void add_in_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_base = buf; + con->v2.in_kvecs[con->v2.in_kvec_cnt].iov_len = len; + con->v2.in_kvec_cnt++; + + con->v2.in_iter.nr_segs++; + con->v2.in_iter.count += len; +} + +static void reset_in_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_kvec_cnt = 0; + iov_iter_kvec(&con->v2.in_iter, READ, con->v2.in_kvecs, 0, 0); +} + +static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + con->v2.in_bvec = *bv; + iov_iter_bvec(&con->v2.in_iter, READ, &con->v2.in_bvec, 1, bv->bv_len); +} + +static void set_in_skip(struct ceph_connection *con, int len) +{ + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + dout("%s con %p len %d\n", __func__, con, len); + iov_iter_discard(&con->v2.in_iter, READ, len); +} + +static void add_out_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_base = buf; + con->v2.out_kvecs[con->v2.out_kvec_cnt].iov_len = len; + con->v2.out_kvec_cnt++; + + con->v2.out_iter.nr_segs++; + con->v2.out_iter.count += len; +} + +static void reset_out_kvecs(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_kvec_cnt = 0; + + iov_iter_kvec(&con->v2.out_iter, WRITE, con->v2.out_kvecs, 0, 0); + con->v2.out_iter_sendpage = false; +} + +static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv, + bool zerocopy) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(con->v2.out_zero); + + con->v2.out_bvec = *bv; + con->v2.out_iter_sendpage = zerocopy; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, + con->v2.out_bvec.bv_len); +} + +static void set_out_bvec_zero(struct ceph_connection *con) +{ + WARN_ON(iov_iter_count(&con->v2.out_iter)); + WARN_ON(!con->v2.out_zero); + + con->v2.out_bvec.bv_page = ceph_zero_page; + con->v2.out_bvec.bv_offset = 0; + con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE); + con->v2.out_iter_sendpage = true; + iov_iter_bvec(&con->v2.out_iter, WRITE, &con->v2.out_bvec, 1, 
+ con->v2.out_bvec.bv_len); +} + +static void out_zero_add(struct ceph_connection *con, int len) +{ + dout("%s con %p len %d\n", __func__, con, len); + con->v2.out_zero += len; +} + +static void *alloc_conn_buf(struct ceph_connection *con, int len) +{ + void *buf; + + dout("%s con %p len %d\n", __func__, con, len); + + if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs))) + return NULL; + + buf = ceph_kvmalloc(len, GFP_NOIO); + if (!buf) + return NULL; + + con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf; + return buf; +} + +static void free_conn_bufs(struct ceph_connection *con) +{ + while (con->v2.conn_buf_cnt) + kvfree(con->v2.conn_bufs[--con->v2.conn_buf_cnt]); +} + +static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs)); + + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_base = buf; + con->v2.in_sign_kvecs[con->v2.in_sign_kvec_cnt].iov_len = len; + con->v2.in_sign_kvec_cnt++; +} + +static void clear_in_sign_kvecs(struct ceph_connection *con) +{ + con->v2.in_sign_kvec_cnt = 0; +} + +static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len) +{ + BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs)); + + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_base = buf; + con->v2.out_sign_kvecs[con->v2.out_sign_kvec_cnt].iov_len = len; + con->v2.out_sign_kvec_cnt++; +} + +static void clear_out_sign_kvecs(struct ceph_connection *con) +{ + con->v2.out_sign_kvec_cnt = 0; +} + +static bool con_secure(struct ceph_connection *con) +{ + return con->v2.con_mode == CEPH_CON_MODE_SECURE; +} + +static int front_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.front_len); +} + +static int middle_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.middle_len); +} + +static int data_len(const struct ceph_msg *msg) +{ + return le32_to_cpu(msg->hdr.data_len); +} + +static bool need_padding(int len) +{ + return !IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN); +} + +static int padded_len(int len) +{ + return ALIGN(len, CEPH_GCM_BLOCK_LEN); +} + +static int padding_len(int len) +{ + return padded_len(len) - len; +} + +/* preamble + control segment */ +static int head_onwire_len(int ctrl_len, bool secure) +{ + int head_len; + int rem_len; + + if (secure) { + head_len = CEPH_PREAMBLE_SECURE_LEN; + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + head_len += padded_len(rem_len) + CEPH_GCM_TAG_LEN; + } + } else { + head_len = CEPH_PREAMBLE_PLAIN_LEN; + if (ctrl_len) + head_len += ctrl_len + CEPH_CRC_LEN; + } + return head_len; +} + +/* front, middle and data segments + epilogue */ +static int __tail_onwire_len(int front_len, int middle_len, int data_len, + bool secure) +{ + if (!front_len && !middle_len && !data_len) + return 0; + + if (!secure) + return front_len + middle_len + data_len + + CEPH_EPILOGUE_PLAIN_LEN; + + return padded_len(front_len) + padded_len(middle_len) + + padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN; +} + +static int tail_onwire_len(const struct ceph_msg *msg, bool secure) +{ + return __tail_onwire_len(front_len(msg), middle_len(msg), + data_len(msg), secure); +} + +/* head_onwire_len(sizeof(struct ceph_msg_header2), false) */ +#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \ + sizeof(struct ceph_msg_header2) + \ + CEPH_CRC_LEN) + +static const int frame_aligns[] = { + sizeof(void *), + sizeof(void *), + sizeof(void *), + PAGE_SIZE +}; + +/* + * Discards 
trailing empty segments, unless there is just one segment. + * A frame always has at least one (possibly empty) segment. + */ +static int calc_segment_count(const int *lens, int len_cnt) +{ + int i; + + for (i = len_cnt - 1; i >= 0; i--) { + if (lens[i]) + return i + 1; + } + + return 1; +} + +static void init_frame_desc(struct ceph_frame_desc *desc, int tag, + const int *lens, int len_cnt) +{ + int i; + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = tag; + desc->fd_seg_cnt = calc_segment_count(lens, len_cnt); + BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT); + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = lens[i]; + desc->fd_aligns[i] = frame_aligns[i]; + } +} + +/* + * Preamble crc covers everything up to itself (28 bytes) and + * is calculated and verified irrespective of the connection mode + * (i.e. even if the frame is encrypted). + */ +static void encode_preamble(const struct ceph_frame_desc *desc, void *p) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + void *start = p; + int i; + + memset(p, 0, CEPH_PREAMBLE_LEN); + + ceph_encode_8(&p, desc->fd_tag); + ceph_encode_8(&p, desc->fd_seg_cnt); + for (i = 0; i < desc->fd_seg_cnt; i++) { + ceph_encode_32(&p, desc->fd_lens[i]); + ceph_encode_16(&p, desc->fd_aligns[i]); + } + + put_unaligned_le32(crc32c(0, start, crcp - start), crcp); +} + +static int decode_preamble(void *p, struct ceph_frame_desc *desc) +{ + void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN; + u32 crc, expected_crc; + int i; + + crc = crc32c(0, p, crcp - p); + expected_crc = get_unaligned_le32(crcp); + if (crc != expected_crc) { + pr_err("bad preamble crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + memset(desc, 0, sizeof(*desc)); + + desc->fd_tag = ceph_decode_8(&p); + desc->fd_seg_cnt = ceph_decode_8(&p); + if (desc->fd_seg_cnt < 1 || + desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) { + pr_err("bad segment count %d\n", desc->fd_seg_cnt); + return -EINVAL; + } + for (i = 0; i < desc->fd_seg_cnt; i++) { + desc->fd_lens[i] = ceph_decode_32(&p); + desc->fd_aligns[i] = ceph_decode_16(&p); + } + + /* + * This would fire for FRAME_TAG_WAIT (it has one empty + * segment), but we should never get it as client. + */ + if (!desc->fd_lens[desc->fd_seg_cnt - 1]) { + pr_err("last segment empty\n"); + return -EINVAL; + } + + if (desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) { + pr_err("control segment too big %d\n", desc->fd_lens[0]); + return -EINVAL; + } + if (desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) { + pr_err("front segment too big %d\n", desc->fd_lens[1]); + return -EINVAL; + } + if (desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) { + pr_err("middle segment too big %d\n", desc->fd_lens[2]); + return -EINVAL; + } + if (desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) { + pr_err("data segment too big %d\n", desc->fd_lens[3]); + return -EINVAL; + } + + return 0; +} + +static void encode_epilogue_plain(struct ceph_connection *con, bool aborted) +{ + con->v2.out_epil.late_status = aborted ? FRAME_LATE_STATUS_ABORTED : + FRAME_LATE_STATUS_COMPLETE; + cpu_to_le32s(&con->v2.out_epil.front_crc); + cpu_to_le32s(&con->v2.out_epil.middle_crc); + cpu_to_le32s(&con->v2.out_epil.data_crc); +} + +static void encode_epilogue_secure(struct ceph_connection *con, bool aborted) +{ + memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil)); + con->v2.out_epil.late_status = aborted ? 
FRAME_LATE_STATUS_ABORTED : + FRAME_LATE_STATUS_COMPLETE; +} + +static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc, + u32 *data_crc) +{ + u8 late_status; + + late_status = ceph_decode_8(&p); + if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) != + FRAME_LATE_STATUS_COMPLETE) { + /* we should never get an aborted message as client */ + pr_err("bad late_status 0x%x\n", late_status); + return -EINVAL; + } + + if (front_crc && middle_crc && data_crc) { + *front_crc = ceph_decode_32(&p); + *middle_crc = ceph_decode_32(&p); + *data_crc = ceph_decode_32(&p); + } + + return 0; +} + +static void fill_header(struct ceph_msg_header *hdr, + const struct ceph_msg_header2 *hdr2, + int front_len, int middle_len, int data_len, + const struct ceph_entity_name *peer_name) +{ + hdr->seq = hdr2->seq; + hdr->tid = hdr2->tid; + hdr->type = hdr2->type; + hdr->priority = hdr2->priority; + hdr->version = hdr2->version; + hdr->front_len = cpu_to_le32(front_len); + hdr->middle_len = cpu_to_le32(middle_len); + hdr->data_len = cpu_to_le32(data_len); + hdr->data_off = hdr2->data_off; + hdr->src = *peer_name; + hdr->compat_version = hdr2->compat_version; + hdr->reserved = 0; + hdr->crc = 0; +} + +static void fill_header2(struct ceph_msg_header2 *hdr2, + const struct ceph_msg_header *hdr, u64 ack_seq) +{ + hdr2->seq = hdr->seq; + hdr2->tid = hdr->tid; + hdr2->type = hdr->type; + hdr2->priority = hdr->priority; + hdr2->version = hdr->version; + hdr2->data_pre_padding_len = 0; + hdr2->data_off = hdr->data_off; + hdr2->ack_seq = cpu_to_le64(ack_seq); + hdr2->flags = 0; + hdr2->compat_version = hdr->compat_version; + hdr2->reserved = 0; +} + +static int verify_control_crc(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + u32 crc, expected_crc; + + WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN); + + crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len); + expected_crc = get_unaligned_le32(con->v2.in_kvecs[1].iov_base); + if (crc != expected_crc) { + pr_err("bad control crc, calculated %u, expected %u\n", + crc, expected_crc); + return -EBADMSG; + } + + return 0; +} + +static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc, + u32 middle_crc, u32 data_crc) +{ + if (front_len(con->in_msg)) { + con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base, + front_len(con->in_msg)); + } else { + WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg)); + con->in_front_crc = -1; + } + + if (middle_len(con->in_msg)) + con->in_middle_crc = crc32c(-1, + con->in_msg->middle->vec.iov_base, + middle_len(con->in_msg)); + else if (data_len(con->in_msg)) + con->in_middle_crc = -1; + else + con->in_middle_crc = 0; + + if (!data_len(con->in_msg)) + con->in_data_crc = 0; + + dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg, + con->in_front_crc, con->in_middle_crc, con->in_data_crc); + + if (con->in_front_crc != front_crc) { + pr_err("bad front crc, calculated %u, expected %u\n", + con->in_front_crc, front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != middle_crc) { + pr_err("bad middle crc, calculated %u, expected %u\n", + con->in_middle_crc, middle_crc); + return -EBADMSG; + } + if (con->in_data_crc != data_crc) { + pr_err("bad data crc, calculated %u, expected %u\n", + con->in_data_crc, data_crc); + return -EBADMSG; + } + + return 0; +} + +static int setup_crypto(struct ceph_connection *con, + u8 *session_key, int session_key_len, + u8 *con_secret, int con_secret_len) +{ + unsigned int 
noio_flag; + void *p; + int ret; + + dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n", + __func__, con, con->v2.con_mode, session_key_len, con_secret_len); + WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req); + + if (con->v2.con_mode != CEPH_CON_MODE_CRC && + con->v2.con_mode != CEPH_CON_MODE_SECURE) { + pr_err("bad con_mode %d\n", con->v2.con_mode); + return -EINVAL; + } + + if (!session_key_len) { + WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC); + WARN_ON(con_secret_len); + return 0; /* auth_none */ + } + + noio_flag = memalloc_noio_save(); + con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.hmac_tfm)) { + ret = PTR_ERR(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + pr_err("failed to allocate hmac tfm context: %d\n", ret); + return ret; + } + + WARN_ON((unsigned long)session_key & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key, + session_key_len); + if (ret) { + pr_err("failed to set hmac key: %d\n", ret); + return ret; + } + + if (con->v2.con_mode == CEPH_CON_MODE_CRC) { + WARN_ON(con_secret_len); + return 0; /* auth_x, plain mode */ + } + + if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) { + pr_err("con_secret too small %d\n", con_secret_len); + return -EINVAL; + } + + noio_flag = memalloc_noio_save(); + con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + memalloc_noio_restore(noio_flag); + if (IS_ERR(con->v2.gcm_tfm)) { + ret = PTR_ERR(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + pr_err("failed to allocate gcm tfm context: %d\n", ret); + return ret; + } + + p = con_secret; + WARN_ON((unsigned long)p & crypto_aead_alignmask(con->v2.gcm_tfm)); + ret = crypto_aead_setkey(con->v2.gcm_tfm, p, CEPH_GCM_KEY_LEN); + if (ret) { + pr_err("failed to set gcm key: %d\n", ret); + return ret; + } + + p += CEPH_GCM_KEY_LEN; + WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN); + ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN); + if (ret) { + pr_err("failed to set gcm tag size: %d\n", ret); + return ret; + } + + con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO); + if (!con->v2.gcm_req) { + pr_err("failed to allocate gcm request\n"); + return -ENOMEM; + } + + crypto_init_wait(&con->v2.gcm_wait); + aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &con->v2.gcm_wait); + + memcpy(&con->v2.in_gcm_nonce, p, CEPH_GCM_IV_LEN); + memcpy(&con->v2.out_gcm_nonce, p + CEPH_GCM_IV_LEN, CEPH_GCM_IV_LEN); + return 0; /* auth_x, secure mode */ +} + +static int hmac_sha256(struct ceph_connection *con, const struct kvec *kvecs, + int kvec_cnt, u8 *hmac) +{ + SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ + int ret; + int i; + + dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con, + con->v2.hmac_tfm, kvec_cnt); + + if (!con->v2.hmac_tfm) { + memset(hmac, 0, SHA256_DIGEST_SIZE); + return 0; /* auth_none */ + } + + desc->tfm = con->v2.hmac_tfm; + ret = crypto_shash_init(desc); + if (ret) + return ret; + + for (i = 0; i < kvec_cnt; i++) { + WARN_ON((unsigned long)kvecs[i].iov_base & + crypto_shash_alignmask(con->v2.hmac_tfm)); + ret = crypto_shash_update(desc, kvecs[i].iov_base, + kvecs[i].iov_len); + if (ret) + return ret; + } + + ret = crypto_shash_final(desc, hmac); + if (ret) + return ret; + + shash_desc_zero(desc); + return 0; /* auth_x, both plain and secure modes */ +} + +static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce) +{ + 
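+ /* the IV embeds a little-endian counter, bumped after each successful gcm_crypt() */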
u64 counter; + + counter = le64_to_cpu(nonce->counter); + nonce->counter = cpu_to_le64(counter + 1); +} + +static int gcm_crypt(struct ceph_connection *con, bool encrypt, + struct scatterlist *src, struct scatterlist *dst, + int src_len) +{ + struct ceph_gcm_nonce *nonce; + int ret; + + nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce; + + aead_request_set_ad(con->v2.gcm_req, 0); /* no AAD */ + aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce); + ret = crypto_wait_req(encrypt ? crypto_aead_encrypt(con->v2.gcm_req) : + crypto_aead_decrypt(con->v2.gcm_req), + &con->v2.gcm_wait); + if (ret) + return ret; + + gcm_inc_nonce(nonce); + return 0; +} + +static void get_bvec_at(struct ceph_msg_data_cursor *cursor, + struct bio_vec *bv) +{ + struct page *page; + size_t off, len; + + WARN_ON(!cursor->total_resid); + + /* skip zero-length data items */ + while (!cursor->resid) + ceph_msg_data_advance(cursor, 0); + + /* get a piece of data, cursor isn't advanced */ + page = ceph_msg_data_next(cursor, &off, &len, NULL); + + bv->bv_page = page; + bv->bv_offset = off; + bv->bv_len = len; +} + +static int calc_sg_cnt(void *buf, int buf_len) +{ + int sg_cnt; + + if (!buf_len) + return 0; + + sg_cnt = need_padding(buf_len) ? 1 : 0; + if (is_vmalloc_addr(buf)) { + WARN_ON(offset_in_page(buf)); + sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT; + } else { + sg_cnt++; + } + + return sg_cnt; +} + +static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + int sg_cnt; + + if (!data_len) + return 0; + + sg_cnt = need_padding(data_len) ? 1 : 0; + do { + get_bvec_at(cursor, &bv); + sg_cnt++; + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + return sg_cnt; +} + +static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad) +{ + void *end = buf + buf_len; + struct page *page; + int len; + void *p; + + if (!buf_len) + return; + + if (is_vmalloc_addr(buf)) { + p = buf; + do { + page = vmalloc_to_page(p); + len = min_t(int, end - p, PAGE_SIZE); + WARN_ON(!page || !len || offset_in_page(p)); + sg_set_page(*sg, page, len, 0); + *sg = sg_next(*sg); + p += len; + } while (p != end); + } else { + sg_set_buf(*sg, buf, buf_len); + *sg = sg_next(*sg); + } + + if (need_padding(buf_len)) { + sg_set_buf(*sg, pad, padding_len(buf_len)); + *sg = sg_next(*sg); + } +} + +static void init_sgs_cursor(struct scatterlist **sg, + struct ceph_msg_data_cursor *cursor, u8 *pad) +{ + int data_len = cursor->total_resid; + struct bio_vec bv; + + if (!data_len) + return; + + do { + get_bvec_at(cursor, &bv); + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + *sg = sg_next(*sg); + + ceph_msg_data_advance(cursor, bv.bv_len); + } while (cursor->total_resid); + + if (need_padding(data_len)) { + sg_set_buf(*sg, pad, padding_len(data_len)); + *sg = sg_next(*sg); + } +} + +static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, + u8 *front_pad, u8 *middle_pad, u8 *data_pad, + void *epilogue, bool add_tag) +{ + struct ceph_msg_data_cursor cursor; + struct scatterlist *cur_sg; + int sg_cnt; + int ret; + + if (!front_len(msg) && !middle_len(msg) && !data_len(msg)) + return 0; + + sg_cnt = 1; /* epilogue + [auth tag] */ + if (front_len(msg)) + sg_cnt += calc_sg_cnt(msg->front.iov_base, + front_len(msg)); + if (middle_len(msg)) + sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base, + middle_len(msg)); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + 
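+ /* one sg per data piece, plus one more if padding is needed */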
sg_cnt += calc_sg_cnt_cursor(&cursor); + } + + ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO); + if (ret) + return ret; + + cur_sg = sgt->sgl; + if (front_len(msg)) + init_sgs(&cur_sg, msg->front.iov_base, front_len(msg), + front_pad); + if (middle_len(msg)) + init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg), + middle_pad); + if (data_len(msg)) { + ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); + init_sgs_cursor(&cur_sg, &cursor, data_pad); + } + + WARN_ON(!sg_is_last(cur_sg)); + sg_set_buf(cur_sg, epilogue, + CEPH_GCM_BLOCK_LEN + (add_tag ? CEPH_GCM_TAG_LEN : 0)); + return 0; +} + +static int decrypt_preamble(struct ceph_connection *con) +{ + struct scatterlist sg; + + sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN); + return gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN); +} + +static int decrypt_control_remainder(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + + WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len); + WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len); + + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len); + sg_set_buf(&sgs[1], con->v2.in_buf, pt_len); + + return gcm_crypt(con, false, sgs, sgs, + padded_len(rem_len) + CEPH_GCM_TAG_LEN); +} + +static int decrypt_message(struct ceph_connection *con) +{ + struct sg_table sgt = {}; + int ret; + + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), + MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), + con->v2.in_buf, true); + if (ret) + goto out; + + ret = gcm_crypt(con, false, sgt.sgl, sgt.sgl, + tail_onwire_len(con->in_msg, true)); + +out: + sg_free_table(&sgt); + return ret; +} + +static int prepare_banner(struct ceph_connection *con) +{ + int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8; + void *buf, *p; + + buf = alloc_conn_buf(con, buf_len); + if (!buf) + return -ENOMEM; + + p = buf; + ceph_encode_copy(&p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN); + ceph_encode_16(&p, sizeof(u64) + sizeof(u64)); + ceph_encode_64(&p, CEPH_MSGR2_SUPPORTED_FEATURES); + ceph_encode_64(&p, CEPH_MSGR2_REQUIRED_FEATURES); + WARN_ON(p != buf + buf_len); + + add_out_kvec(con, buf, buf_len); + add_out_sign_kvec(con, buf, buf_len); + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for control crc + * + * extdata (optional): + * control body (extdata_len bytes) + * + * Compute control crc and gather base and extdata into: + * + * preamble + * control body (ctrl_len + extdata_len bytes) + * control crc + * + * Preamble should already be encoded at the start of base. 
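+ *
+ * (extdata is only used for the auth request frames, where the
+ * authorizer buffer lives in a separate allocation.)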
+ */ +static void prepare_head_plain(struct ceph_connection *con, void *base, + int ctrl_len, void *extdata, int extdata_len, + bool to_be_signed) +{ + int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN; + void *crcp = base + base_len - CEPH_CRC_LEN; + u32 crc; + + crc = crc32c(-1, CTRL_BODY(base), ctrl_len); + if (extdata_len) + crc = crc32c(crc, extdata, extdata_len); + put_unaligned_le32(crc, crcp); + + if (!extdata_len) { + add_out_kvec(con, base, base_len); + if (to_be_signed) + add_out_sign_kvec(con, base, base_len); + return; + } + + add_out_kvec(con, base, crcp - base); + add_out_kvec(con, extdata, extdata_len); + add_out_kvec(con, crcp, CEPH_CRC_LEN); + if (to_be_signed) { + add_out_sign_kvec(con, base, crcp - base); + add_out_sign_kvec(con, extdata, extdata_len); + add_out_sign_kvec(con, crcp, CEPH_CRC_LEN); + } +} + +static int prepare_head_secure_small(struct ceph_connection *con, + void *base, int ctrl_len) +{ + struct scatterlist sg; + int ret; + + /* inline buffer padding? */ + if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN) + memset(CTRL_BODY(base) + ctrl_len, 0, + CEPH_PREAMBLE_INLINE_LEN - ctrl_len); + + sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN); + ret = gcm_crypt(con, true, &sg, &sg, + CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN); + if (ret) + return ret; + + add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN); + return 0; +} + +/* + * base: + * preamble + * control body (ctrl_len bytes) + * space for padding, if needed + * space for control remainder auth tag + * space for preamble auth tag + * + * Encrypt preamble and the inline portion, then encrypt the remainder + * and gather into: + * + * preamble + * control body (48 bytes) + * preamble auth tag + * control body (ctrl_len - 48 bytes) + * zero padding, if needed + * control remainder auth tag + * + * Preamble should already be encoded at the start of base. + */ +static int prepare_head_secure_big(struct ceph_connection *con, + void *base, int ctrl_len) +{ + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN; + void *rem_tag = rem + padded_len(rem_len); + void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN; + struct scatterlist sgs[2]; + int ret; + + sg_init_table(sgs, 2); + sg_set_buf(&sgs[0], base, rem - base); + sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN); + ret = gcm_crypt(con, true, sgs, sgs, rem - base); + if (ret) + return ret; + + /* control remainder padding? 
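(zero-filled up to the next GCM block boundary)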
*/ + if (need_padding(rem_len)) + memset(rem + rem_len, 0, padding_len(rem_len)); + + sg_init_one(&sgs[0], rem, pmbl_tag - rem); + ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem); + if (ret) + return ret; + + add_out_kvec(con, base, rem - base); + add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN); + add_out_kvec(con, rem, pmbl_tag - rem); + return 0; +} + +static int __prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len, void *extdata, + int extdata_len, bool to_be_signed) +{ + int total_len = ctrl_len + extdata_len; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag, + total_len, ctrl_len, extdata_len); + + /* extdata may be vmalloc'ed but not base */ + if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len)) + return -EINVAL; + + init_frame_desc(&desc, tag, &total_len, 1); + encode_preamble(&desc, base); + + if (con_secure(con)) { + if (WARN_ON(extdata_len || to_be_signed)) + return -EINVAL; + + if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN) + /* fully inlined, inline buffer may need padding */ + ret = prepare_head_secure_small(con, base, ctrl_len); + else + /* partially inlined, inline buffer is full */ + ret = prepare_head_secure_big(con, base, ctrl_len); + if (ret) + return ret; + } else { + prepare_head_plain(con, base, ctrl_len, extdata, extdata_len, + to_be_signed); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +static int prepare_control(struct ceph_connection *con, int tag, + void *base, int ctrl_len) +{ + return __prepare_control(con, tag, base, ctrl_len, NULL, 0, false); +} + +static int prepare_hello(struct ceph_connection *con) +{ + void *buf, *p; + int ctrl_len; + + ctrl_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr); + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, CEPH_ENTITY_TYPE_CLIENT); + ceph_encode_entity_addr(&p, &con->peer_addr); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return __prepare_control(con, FRAME_TAG_HELLO, buf, ctrl_len, + NULL, 0, true); +} + +/* so that head_onwire_len(AUTH_BUF_LEN, false) is 512 */ +#define AUTH_BUF_LEN (512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN) + +static int prepare_auth_request(struct ceph_connection *con) +{ + void *authorizer, *authorizer_copy; + int ctrl_len, authorizer_len; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + mutex_unlock(&con->mutex); + ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_HELLO) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p get_auth_request ret %d\n", __func__, con, ret); + if (ret) + return ret; + + authorizer_copy = alloc_conn_buf(con, authorizer_len); + if (!authorizer_copy) + return -ENOMEM; + + memcpy(authorizer_copy, authorizer, authorizer_len); + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len, + authorizer_copy, authorizer_len, true); +} + +static int prepare_auth_request_more(struct ceph_connection *con, + void *reply, int reply_len) +{ + int ctrl_len, authorizer_len; + void *authorizer; + void *buf; + int ret; + + ctrl_len = AUTH_BUF_LEN; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false)); + if (!buf) + return -ENOMEM; + + mutex_unlock(&con->mutex); + ret = 
con->ops->handle_auth_reply_more(con, reply, reply_len, + CTRL_BODY(buf), &ctrl_len, + &authorizer, &authorizer_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret); + if (ret) + return ret; + + return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf, + ctrl_len, authorizer, authorizer_len, true); +} + +static int prepare_auth_signature(struct ceph_connection *con) +{ + void *buf; + int ret; + + buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, false)); + if (!buf) + return -ENOMEM; + + ret = hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); + if (ret) + return ret; + + return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, + SHA256_DIGEST_SIZE); +} + +static int prepare_client_ident(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_client *client = from_msgr(con->msgr); + u64 global_id = ceph_client_gid(client); + void *buf, *p; + int ctrl_len; + + WARN_ON(con->v2.server_cookie); + WARN_ON(con->v2.connect_seq); + WARN_ON(con->v2.peer_global_seq); + + if (!con->v2.client_cookie) { + do { + get_random_bytes(&con->v2.client_cookie, + sizeof(con->v2.client_cookie)); + } while (!con->v2.client_cookie); + dout("%s con %p generated cookie 0x%llx\n", __func__, con, + con->v2.client_cookie); + } else { + dout("%s con %p cookie already set 0x%llx\n", __func__, con, + con->v2.client_cookie); + } + + dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce), + global_id, con->v2.global_seq, client->supported_features, + client->required_features, con->v2.client_cookie); + + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + + ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if (!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* addrvec marker */ + ceph_encode_32(&p, 1); /* addr_cnt */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_entity_addr(&p, &con->peer_addr); + ceph_encode_64(&p, global_id); + ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, client->supported_features); + ceph_encode_64(&p, client->required_features); + ceph_encode_64(&p, 0); /* flags */ + ceph_encode_64(&p, con->v2.client_cookie); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len); +} + +static int prepare_session_reconnect(struct ceph_connection *con) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + void *buf, *p; + int ctrl_len; + + WARN_ON(!con->v2.client_cookie); + WARN_ON(!con->v2.server_cookie); + WARN_ON(!con->v2.connect_seq); + WARN_ON(!con->v2.peer_global_seq); + + dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n", + __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce), + con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq, + con->v2.connect_seq, con->in_seq); + + ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8; + buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con))); + if 
(!buf) + return -ENOMEM; + + p = CTRL_BODY(buf); + ceph_encode_8(&p, 2); /* entity_addrvec_t marker */ + ceph_encode_32(&p, 1); /* my_addrs len */ + ceph_encode_entity_addr(&p, my_addr); + ceph_encode_64(&p, con->v2.client_cookie); + ceph_encode_64(&p, con->v2.server_cookie); + ceph_encode_64(&p, con->v2.global_seq); + ceph_encode_64(&p, con->v2.connect_seq); + ceph_encode_64(&p, con->in_seq); + WARN_ON(p != CTRL_BODY(buf) + ctrl_len); + + return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len); +} + +static int prepare_keepalive2(struct ceph_connection *con) +{ + struct ceph_timespec *ts = CTRL_BODY(con->v2.out_buf); + struct timespec64 now; + + ktime_get_real_ts64(&now); + dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec, + now.tv_nsec); + + ceph_encode_timespec64(ts, &now); + + reset_out_kvecs(con); + return prepare_control(con, FRAME_TAG_KEEPALIVE2, con->v2.out_buf, + sizeof(struct ceph_timespec)); +} + +static int prepare_ack(struct ceph_connection *con) +{ + void *p; + + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + p = CTRL_BODY(con->v2.out_buf); + ceph_encode_64(&p, con->in_seq_acked); + + reset_out_kvecs(con); + return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8); +} + +static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) +{ + dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con, + con->out_msg, aborted, con->v2.out_epil.front_crc, + con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc); + + encode_epilogue_plain(con, aborted); + add_out_kvec(con, &con->v2.out_epil, CEPH_EPILOGUE_PLAIN_LEN); +} + +/* + * For "used" empty segments, crc is -1. For unused (trailing) + * segments, crc is 0. + */ +static void prepare_message_plain(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + + prepare_head_plain(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2), NULL, 0, false); + + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. + */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return; + } + + con->v2.out_epil.front_crc = -1; + con->v2.out_epil.middle_crc = -1; + con->v2.out_state = OUT_S_QUEUE_DATA; + return; + } + + if (front_len(msg)) { + con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base, + front_len(msg)); + add_out_kvec(con, msg->front.iov_base, front_len(msg)); + } else { + /* middle (at least) is there, checked above */ + con->v2.out_epil.front_crc = -1; + } + + if (middle_len(msg)) { + con->v2.out_epil.middle_crc = + crc32c(-1, msg->middle->vec.iov_base, middle_len(msg)); + add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + } else { + con->v2.out_epil.middle_crc = data_len(msg) ? -1 : 0; + } + + if (data_len(msg)) { + con->v2.out_state = OUT_S_QUEUE_DATA; + } else { + con->v2.out_epil.data_crc = 0; + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; + } +} + +/* + * Unfortunately the kernel crypto API doesn't support streaming + * (piecewise) operation for AEAD algorithms, so we can't get away + * with a fixed size buffer and a couple sgs. Instead, we have to + * allocate pages for the entire tail of the message (currently up + * to ~32M) and two sgs arrays (up to ~256K each)... 
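+ *
+ * (The plaintext sgs map the message in place; the ciphertext lands
+ * in freshly allocated out_enc_pages and is queued from there in
+ * OUT_S_QUEUE_ENC_PAGE state.)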
+ */ +static int prepare_message_secure(struct ceph_connection *con) +{ + void *zerop = page_address(ceph_zero_page); + struct sg_table enc_sgt = {}; + struct sg_table sgt = {}; + struct page **enc_pages; + int enc_page_cnt; + int tail_len; + int ret; + + ret = prepare_head_secure_small(con, con->v2.out_buf, + sizeof(struct ceph_msg_header2)); + if (ret) + return ret; + + tail_len = tail_onwire_len(con->out_msg, true); + if (!tail_len) { + /* + * Empty message: once the head is written, + * we are done -- there is no epilogue. + */ + con->v2.out_state = OUT_S_FINISH_MESSAGE; + return 0; + } + + encode_epilogue_secure(con, false); + ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, + &con->v2.out_epil, false); + if (ret) + goto out; + + enc_page_cnt = calc_pages_for(0, tail_len); + enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO); + if (IS_ERR(enc_pages)) { + ret = PTR_ERR(enc_pages); + goto out; + } + + WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = enc_pages; + con->v2.out_enc_page_cnt = enc_page_cnt; + con->v2.out_enc_resid = tail_len; + con->v2.out_enc_i = 0; + + ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt, + 0, tail_len, GFP_NOIO); + if (ret) + goto out; + + ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl, + tail_len - CEPH_GCM_TAG_LEN); + if (ret) + goto out; + + dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con, + con->out_msg, sgt.orig_nents, enc_page_cnt); + con->v2.out_state = OUT_S_QUEUE_ENC_PAGE; + +out: + sg_free_table(&sgt); + sg_free_table(&enc_sgt); + return ret; +} + +static int prepare_message(struct ceph_connection *con) +{ + int lens[] = { + sizeof(struct ceph_msg_header2), + front_len(con->out_msg), + middle_len(con->out_msg), + data_len(con->out_msg) + }; + struct ceph_frame_desc desc; + int ret; + + dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con, + con->out_msg, lens[0], lens[1], lens[2], lens[3]); + + if (con->in_seq > con->in_seq_acked) { + dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + } + + reset_out_kvecs(con); + init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4); + encode_preamble(&desc, con->v2.out_buf); + fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr, + con->in_seq_acked); + + if (con_secure(con)) { + ret = prepare_message_secure(con); + if (ret) + return ret; + } else { + prepare_message_plain(con); + } + + ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +static int prepare_read_banner_prefix(struct ceph_connection *con) +{ + void *buf; + + buf = alloc_conn_buf(con, CEPH_BANNER_V2_PREFIX_LEN); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + add_in_sign_kvec(con, buf, CEPH_BANNER_V2_PREFIX_LEN); + con->state = CEPH_CON_S_V2_BANNER_PREFIX; + return 0; +} + +static int prepare_read_banner_payload(struct ceph_connection *con, + int payload_len) +{ + void *buf; + + buf = alloc_conn_buf(con, payload_len); + if (!buf) + return -ENOMEM; + + reset_in_kvecs(con); + add_in_kvec(con, buf, payload_len); + add_in_sign_kvec(con, buf, payload_len); + con->state = CEPH_CON_S_V2_BANNER_PAYLOAD; + return 0; +} + +static void prepare_read_preamble(struct ceph_connection *con) +{ + reset_in_kvecs(con); + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ? 
CEPH_PREAMBLE_SECURE_LEN : + CEPH_PREAMBLE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_PREAMBLE; +} + +static int prepare_read_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int head_len; + void *buf; + + reset_in_kvecs(con); + if (con->state == CEPH_CON_S_V2_HELLO || + con->state == CEPH_CON_S_V2_AUTH) { + head_len = head_onwire_len(ctrl_len, false); + buf = alloc_conn_buf(con, head_len); + if (!buf) + return -ENOMEM; + + /* preserve preamble */ + memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN); + + add_in_kvec(con, CTRL_BODY(buf), ctrl_len); + add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN); + add_in_sign_kvec(con, buf, head_len); + } else { + if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + add_in_kvec(con, buf, ctrl_len); + } else { + add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len); + } + add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN); + } + con->v2.in_state = IN_S_HANDLE_CONTROL; + return 0; +} + +static int prepare_read_control_remainder(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN; + void *buf; + + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + memcpy(buf, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN); + + reset_in_kvecs(con); + add_in_kvec(con, buf + CEPH_PREAMBLE_INLINE_LEN, rem_len); + add_in_kvec(con, con->v2.in_buf, + padding_len(rem_len) + CEPH_GCM_TAG_LEN); + con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER; + return 0; +} + +static void prepare_read_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg, + data_len(con->in_msg)); + + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; +} + +static void prepare_read_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + if (!con_secure(con)) + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len); + if (con->v2.in_cursor.total_resid) { + get_bvec_at(&con->v2.in_cursor, &bv); + set_in_bvec(con, &bv); + WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); + return; + } + + /* + * We've read all data. Prepare to read data padding (if any) + * and epilogue. 
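+ *
+ * (In secure mode the epilogue is encrypted along with the tail and
+ * followed by its auth tag; in crc mode it carries the three segment
+ * crcs in the clear.)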
+ */ + reset_in_kvecs(con); + if (con_secure(con)) { + if (need_padding(data_len(con->in_msg))) + add_in_kvec(con, DATA_PAD(con->v2.in_buf), + padding_len(data_len(con->in_msg))); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_SECURE_LEN); + } else { + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + } + con->v2.in_state = IN_S_HANDLE_EPILOGUE; +} + +static void __finish_skip(struct ceph_connection *con) +{ + con->in_seq++; + prepare_read_preamble(con); +} + +static void prepare_skip_message(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int tail_len; + + dout("%s con %p %d+%d+%d\n", __func__, con, desc->fd_lens[1], + desc->fd_lens[2], desc->fd_lens[3]); + + tail_len = __tail_onwire_len(desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], con_secure(con)); + if (!tail_len) { + __finish_skip(con); + } else { + set_in_skip(con, tail_len); + con->v2.in_state = IN_S_FINISH_SKIP; + } +} + +static int process_banner_prefix(struct ceph_connection *con) +{ + int payload_len; + void *p; + + WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN); + + p = con->v2.in_kvecs[0].iov_base; + if (memcmp(p, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN)) { + if (!memcmp(p, CEPH_BANNER, CEPH_BANNER_LEN)) + con->error_msg = "server is speaking msgr1 protocol"; + else + con->error_msg = "protocol error, bad banner"; + return -EINVAL; + } + + p += CEPH_BANNER_V2_LEN; + payload_len = ceph_decode_16(&p); + dout("%s con %p payload_len %d\n", __func__, con, payload_len); + + return prepare_read_banner_payload(con, payload_len); +} + +static int process_banner_payload(struct ceph_connection *con) +{ + void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len; + u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES; + u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES; + u64 server_feat, server_req_feat; + void *p; + int ret; + + p = con->v2.in_kvecs[0].iov_base; + ceph_decode_64_safe(&p, end, server_feat, bad); + ceph_decode_64_safe(&p, end, server_req_feat, bad); + + dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n", + __func__, con, server_feat, server_req_feat); + + if (req_feat & ~server_feat) { + pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + if (server_req_feat & ~feat) { + pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + feat, server_req_feat & ~feat); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* no reset_out_kvecs() as our banner may still be pending */ + ret = prepare_hello(con); + if (ret) { + pr_err("prepare_hello failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_HELLO; + prepare_read_preamble(con); + return 0; + +bad: + pr_err("failed to decode banner payload\n"); + return -EINVAL; +} + +static int process_hello(struct ceph_connection *con, void *p, void *end) +{ + struct ceph_entity_addr *my_addr = &con->msgr->inst.addr; + struct ceph_entity_addr addr_for_me; + u8 entity_type; + int ret; + + if (con->state != CEPH_CON_S_V2_HELLO) { + con->error_msg = "protocol error, unexpected hello"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, entity_type, bad); + ret = ceph_decode_entity_addr(&p, end, &addr_for_me); + if (ret) { + pr_err("failed to decode addr_for_me: %d\n", ret); + return ret; + } + + dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con, + 
entity_type, ceph_pr_addr(&addr_for_me)); + + if (entity_type != con->peer_name.type) { + pr_err("bad peer type, want %d, got %d\n", + con->peer_name.type, entity_type); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + /* + * Set our address to the address our first peer (i.e. monitor) + * sees that we are connecting from. If we are behind some sort + * of NAT and want to be identified by some private (not NATed) + * address, ip option should be used. + */ + if (ceph_addr_is_blank(my_addr)) { + memcpy(&my_addr->in_addr, &addr_for_me.in_addr, + sizeof(my_addr->in_addr)); + ceph_addr_set_port(my_addr, 0); + dout("%s con %p set my addr %s, as seen by peer %s\n", + __func__, con, ceph_pr_addr(my_addr), + ceph_pr_addr(&con->peer_addr)); + } else { + dout("%s con %p my addr already set %s\n", + __func__, con, ceph_pr_addr(my_addr)); + } + + WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr)); + WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY); + WARN_ON(!my_addr->nonce); + + /* no reset_out_kvecs() as our hello may still be pending */ + ret = prepare_auth_request(con); + if (ret) { + if (ret != -EAGAIN) + pr_err("prepare_auth_request failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_AUTH; + return 0; + +bad: + pr_err("failed to decode hello\n"); + return -EINVAL; +} + +static int process_auth_bad_method(struct ceph_connection *con, + void *p, void *end) +{ + int allowed_protos[8], allowed_modes[8]; + int allowed_proto_cnt, allowed_mode_cnt; + int used_proto, result; + int ret; + int i; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_bad_method"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, used_proto, bad); + ceph_decode_32_safe(&p, end, result, bad); + dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto, + result); + + ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad); + if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) { + pr_err("allowed_protos too big %d\n", allowed_proto_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_proto_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_protos[i], bad); + dout("%s con %p allowed_protos[%d] %d\n", __func__, con, + i, allowed_protos[i]); + } + + ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad); + if (allowed_mode_cnt > ARRAY_SIZE(allowed_modes)) { + pr_err("allowed_modes too big %d\n", allowed_mode_cnt); + return -EINVAL; + } + for (i = 0; i < allowed_mode_cnt; i++) { + ceph_decode_32_safe(&p, end, allowed_modes[i], bad); + dout("%s con %p allowed_modes[%d] %d\n", __func__, con, + i, allowed_modes[i]); + } + + mutex_unlock(&con->mutex); + ret = con->ops->handle_auth_bad_method(con, used_proto, result, + allowed_protos, + allowed_proto_cnt, + allowed_modes, + allowed_mode_cnt); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret); + return ret; + +bad: + pr_err("failed to decode auth_bad_method\n"); + return -EINVAL; +} + +static int process_auth_reply_more(struct ceph_connection *con, + void *p, void *end) +{ + int payload_len; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_reply_more"; + return -EINVAL; + } + + ceph_decode_32_safe(&p, end, payload_len, bad); + ceph_decode_need(&p, end, payload_len, bad); + + dout("%s con %p payload_len %d\n", __func__, con, payload_len); 
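+ /* our auth_request has been answered, so nothing can still be pending */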
+ + reset_out_kvecs(con); + ret = prepare_auth_request_more(con, p, payload_len); + if (ret) { + if (ret != -EAGAIN) + pr_err("prepare_auth_request_more failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode auth_reply_more\n"); + return -EINVAL; +} + +static int process_auth_done(struct ceph_connection *con, void *p, void *end) +{ + u8 session_key[CEPH_KEY_LEN]; + u8 con_secret[CEPH_MAX_CON_SECRET_LEN]; + int session_key_len, con_secret_len; + int payload_len; + u64 global_id; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH) { + con->error_msg = "protocol error, unexpected auth_done"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_32_safe(&p, end, con->v2.con_mode, bad); + ceph_decode_32_safe(&p, end, payload_len, bad); + + dout("%s con %p global_id %llu con_mode %d payload_len %d\n", + __func__, con, global_id, con->v2.con_mode, payload_len); + + mutex_unlock(&con->mutex); + session_key_len = 0; + con_secret_len = 0; + ret = con->ops->handle_auth_done(con, global_id, p, payload_len, + session_key, &session_key_len, + con_secret, &con_secret_len); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_AUTH) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret); + if (ret) + return ret; + + ret = setup_crypto(con, session_key, session_key_len, con_secret, + con_secret_len); + if (ret) + return ret; + + reset_out_kvecs(con); + ret = prepare_auth_signature(con); + if (ret) { + pr_err("prepare_auth_signature failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_AUTH_SIGNATURE; + return 0; + +bad: + pr_err("failed to decode auth_done\n"); + return -EINVAL; +} + +static int process_auth_signature(struct ceph_connection *con, + void *p, void *end) +{ + u8 hmac[SHA256_DIGEST_SIZE]; + int ret; + + if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) { + con->error_msg = "protocol error, unexpected auth_signature"; + return -EINVAL; + } + + ret = hmac_sha256(con, con->v2.out_sign_kvecs, + con->v2.out_sign_kvec_cnt, hmac); + if (ret) + return ret; + + ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); + if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { + con->error_msg = "integrity error, bad auth signature"; + return -EBADMSG; + } + + dout("%s con %p auth signature ok\n", __func__, con); + + /* no reset_out_kvecs() as our auth_signature may still be pending */ + if (!con->v2.server_cookie) { + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + } else { + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_RECONNECT; + } + + return 0; + +bad: + pr_err("failed to decode auth_signature\n"); + return -EINVAL; +} + +static int process_server_ident(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 features, required_features; + struct ceph_entity_addr addr; + u64 global_seq; + u64 global_id; + u64 cookie; + u64 flags; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected server_ident"; + return -EINVAL; + } + + ret = ceph_decode_entity_addrvec(&p, end, true, &addr); + if (ret) { + pr_err("failed to decode server addrs: %d\n", ret); + return ret; + } + + 
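
(An aside on the signature check in process_auth_signature() above:
crypto_memneq() exists so that the comparison does not leak, through
timing, how many leading bytes of the HMAC matched. A minimal userspace
equivalent, as a sketch rather than the kernel implementation, would be:)

#include <stddef.h>

/* Constant-time inequality test: accumulate XOR differences instead of
 * returning at the first mismatch, so the running time is independent
 * of where (and whether) the buffers differ. Nonzero means "not equal". */
unsigned long ct_memneq(const void *a, const void *b, size_t len)
{
	const unsigned char *pa = a, *pb = b;
	unsigned long neq = 0;

	while (len--)
		neq |= (unsigned long)(*pa++ ^ *pb++);
	return neq;
}
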
ceph_decode_64_safe(&p, end, global_id, bad); + ceph_decode_64_safe(&p, end, global_seq, bad); + ceph_decode_64_safe(&p, end, features, bad); + ceph_decode_64_safe(&p, end, required_features, bad); + ceph_decode_64_safe(&p, end, flags, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + + dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n", + __func__, con, ceph_pr_addr(&addr), le32_to_cpu(addr.nonce), + global_id, global_seq, features, required_features, flags, cookie); + + /* is this who we intended to talk to? */ + if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) { + pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n", + ceph_pr_addr(&con->peer_addr), + le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&addr), le32_to_cpu(addr.nonce)); + con->error_msg = "wrong peer at address"; + return -EINVAL; + } + + if (client->required_features & ~features) { + pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n", + features, client->required_features & ~features); + con->error_msg = "missing required protocol features"; + return -EINVAL; + } + + /* + * Both name->type and name->num are set in ceph_con_open() but + * name->num may be bogus in the initial monmap. name->type is + * verified in handle_hello(). + */ + WARN_ON(!con->peer_name.type); + con->peer_name.num = cpu_to_le64(global_id); + con->v2.peer_global_seq = global_seq; + con->peer_features = features; + WARN_ON(required_features & ~client->supported_features); + con->v2.server_cookie = cookie; + + if (flags & CEPH_MSG_CONNECT_LOSSY) { + ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX); + WARN_ON(con->v2.server_cookie); + } else { + WARN_ON(!con->v2.server_cookie); + } + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode server_ident\n"); + return -EINVAL; +} + +static int process_ident_missing_features(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_client *client = from_msgr(con->msgr); + u64 missing_features; + + if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) { + con->error_msg = "protocol error, unexpected ident_missing_features"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, missing_features, bad); + pr_err("RADOS feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n", + client->supported_features, missing_features); + con->error_msg = "missing required protocol features"; + return -EINVAL; + +bad: + pr_err("failed to decode ident_missing_features\n"); + return -EINVAL; +} + +static int process_session_reconnect_ok(struct ceph_connection *con, + void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reconnect_ok"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_requeued(con, seq); + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + con->delay = 0; /* reset backoff memory */ + + con->state = CEPH_CON_S_OPEN; + con->v2.out_state = OUT_S_GET_NEXT; + return 0; + +bad: + pr_err("failed to decode session_reconnect_ok\n"); + return -EINVAL; +} + +static int process_session_retry(struct ceph_connection *con, + void *p, void *end) +{ + u64 connect_seq; + int ret; + 
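+ /* The server reported a newer connect_seq: go one past it and retry session_reconnect. */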
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, connect_seq, bad); + + dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq); + WARN_ON(connect_seq <= con->v2.connect_seq); + con->v2.connect_seq = connect_seq + 1; + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode session_retry\n"); + return -EINVAL; +} + +static int process_session_retry_global(struct ceph_connection *con, + void *p, void *end) +{ + u64 global_seq; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_retry_global"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, global_seq, bad); + + dout("%s con %p global_seq %llu\n", __func__, con, global_seq); + WARN_ON(global_seq <= con->v2.global_seq); + con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq); + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_session_reconnect(con); + if (ret) { + pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret); + return ret; + } + + return 0; + +bad: + pr_err("failed to decode session_retry_global\n"); + return -EINVAL; +} + +static int process_session_reset(struct ceph_connection *con, + void *p, void *end) +{ + bool full; + int ret; + + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + con->error_msg = "protocol error, unexpected session_reset"; + return -EINVAL; + } + + ceph_decode_8_safe(&p, end, full, bad); + if (!full) { + con->error_msg = "protocol error, bad session_reset"; + return -EINVAL; + } + + pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr)); + ceph_con_reset_session(con); + + mutex_unlock(&con->mutex); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + free_conn_bufs(con); + + reset_out_kvecs(con); + ret = prepare_client_ident(con); + if (ret) { + pr_err("prepare_client_ident (rst) failed: %d\n", ret); + return ret; + } + + con->state = CEPH_CON_S_V2_SESSION_CONNECT; + return 0; + +bad: + pr_err("failed to decode session_reset\n"); + return -EINVAL; +} + +static int process_keepalive2_ack(struct ceph_connection *con, + void *p, void *end) +{ + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected keepalive2_ack"; + return -EINVAL; + } + + ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad); + ceph_decode_timespec64(&con->last_keepalive_ack, p); + + dout("%s con %p timestamp %lld.%09ld\n", __func__, con, + con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec); + + return 0; + +bad: + pr_err("failed to decode keepalive2_ack\n"); + return -EINVAL; +} + +static int process_ack(struct ceph_connection *con, void *p, void *end) +{ + u64 seq; + + if (con->state != CEPH_CON_S_OPEN) { + con->error_msg = "protocol error, unexpected ack"; + return -EINVAL; + } + + ceph_decode_64_safe(&p, end, seq, bad); + + dout("%s con %p seq %llu\n", __func__, con, seq); + ceph_con_discard_sent(con, seq); + return 0; + +bad: + pr_err("failed to decode ack\n"); + return -EINVAL; +} + +static int process_control(struct ceph_connection *con, void *p, 
void *end) +{ + int tag = con->v2.in_desc.fd_tag; + int ret; + + dout("%s con %p tag %d len %d\n", __func__, con, tag, (int)(end - p)); + + switch (tag) { + case FRAME_TAG_HELLO: + ret = process_hello(con, p, end); + break; + case FRAME_TAG_AUTH_BAD_METHOD: + ret = process_auth_bad_method(con, p, end); + break; + case FRAME_TAG_AUTH_REPLY_MORE: + ret = process_auth_reply_more(con, p, end); + break; + case FRAME_TAG_AUTH_DONE: + ret = process_auth_done(con, p, end); + break; + case FRAME_TAG_AUTH_SIGNATURE: + ret = process_auth_signature(con, p, end); + break; + case FRAME_TAG_SERVER_IDENT: + ret = process_server_ident(con, p, end); + break; + case FRAME_TAG_IDENT_MISSING_FEATURES: + ret = process_ident_missing_features(con, p, end); + break; + case FRAME_TAG_SESSION_RECONNECT_OK: + ret = process_session_reconnect_ok(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY: + ret = process_session_retry(con, p, end); + break; + case FRAME_TAG_SESSION_RETRY_GLOBAL: + ret = process_session_retry_global(con, p, end); + break; + case FRAME_TAG_SESSION_RESET: + ret = process_session_reset(con, p, end); + break; + case FRAME_TAG_KEEPALIVE2_ACK: + ret = process_keepalive2_ack(con, p, end); + break; + case FRAME_TAG_ACK: + ret = process_ack(con, p, end); + break; + default: + pr_err("bad tag %d\n", tag); + con->error_msg = "protocol error, bad tag"; + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + } + + prepare_read_preamble(con); + return 0; +} + +/* + * Return: + * 1 - con->in_msg set, read message + * 0 - skip message + * <0 - error + */ +static int process_message_header(struct ceph_connection *con, + void *p, void *end) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + struct ceph_msg_header2 *hdr2 = p; + struct ceph_msg_header hdr; + int skip; + int ret; + u64 seq; + + /* verify seq# */ + seq = le64_to_cpu(hdr2->seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr), + seq, con->in_seq + 1); + return 0; + } + if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } + + ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq)); + + fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2], + desc->fd_lens[3], &con->peer_name); + ret = ceph_con_in_msg_alloc(con, &hdr, &skip); + if (ret) + return ret; + + WARN_ON(!con->in_msg ^ skip); + if (skip) + return 0; + + WARN_ON(!con->in_msg); + WARN_ON(con->in_msg->con != con); + return 1; +} + +static int process_message(struct ceph_connection *con) +{ + ceph_con_process_message(con); + + /* + * We could have been closed by ceph_con_close() because + * ceph_con_process_message() temporarily drops con->mutex. 
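+ * If that happened, return -EAGAIN so the caller can re-assess the connection state instead of reading on.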
+ */ + if (con->state != CEPH_CON_S_OPEN) { + dout("%s con %p state changed to %d\n", __func__, con, + con->state); + return -EAGAIN; + } + + prepare_read_preamble(con); + return 0; +} + +static int __handle_control(struct ceph_connection *con, void *p) +{ + void *end = p + con->v2.in_desc.fd_lens[0]; + struct ceph_msg *msg; + int ret; + + if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE) + return process_control(con, p, end); + + ret = process_message_header(con, p, end); + if (ret < 0) + return ret; + if (ret == 0) { + prepare_skip_message(con); + return 0; + } + + msg = con->in_msg; /* set in process_message_header() */ + if (!front_len(msg) && !middle_len(msg)) { + if (!data_len(msg)) + return process_message(con); + + prepare_read_data(con); + return 0; + } + + reset_in_kvecs(con); + if (front_len(msg)) { + WARN_ON(front_len(msg) > msg->front_alloc_len); + add_in_kvec(con, msg->front.iov_base, front_len(msg)); + msg->front.iov_len = front_len(msg); + + if (con_secure(con) && need_padding(front_len(msg))) + add_in_kvec(con, FRONT_PAD(con->v2.in_buf), + padding_len(front_len(msg))); + } else { + msg->front.iov_len = 0; + } + if (middle_len(msg)) { + WARN_ON(middle_len(msg) > msg->middle->alloc_len); + add_in_kvec(con, msg->middle->vec.iov_base, middle_len(msg)); + msg->middle->vec.iov_len = middle_len(msg); + + if (con_secure(con) && need_padding(middle_len(msg))) + add_in_kvec(con, MIDDLE_PAD(con->v2.in_buf), + padding_len(middle_len(msg))); + } else if (msg->middle) { + msg->middle->vec.iov_len = 0; + } + + if (data_len(msg)) { + con->v2.in_state = IN_S_PREPARE_READ_DATA; + } else { + add_in_kvec(con, con->v2.in_buf, + con_secure(con) ? CEPH_EPILOGUE_SECURE_LEN : + CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; + } + return 0; +} + +static int handle_preamble(struct ceph_connection *con) +{ + struct ceph_frame_desc *desc = &con->v2.in_desc; + int ret; + + if (con_secure(con)) { + ret = decrypt_preamble(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad preamble auth tag"; + return ret; + } + } + + ret = decode_preamble(con->v2.in_buf, desc); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad crc"; + else + con->error_msg = "protocol error, bad preamble"; + return ret; + } + + dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__, + con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0], + desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]); + + if (!con_secure(con)) + return prepare_read_control(con); + + if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN) + return prepare_read_control_remainder(con); + + return __handle_control(con, CTRL_BODY(con->v2.in_buf)); +} + +static int handle_control(struct ceph_connection *con) +{ + int ctrl_len = con->v2.in_desc.fd_lens[0]; + void *buf; + int ret; + + WARN_ON(con_secure(con)); + + ret = verify_control_crc(con); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + + if (con->state == CEPH_CON_S_V2_AUTH) { + buf = alloc_conn_buf(con, ctrl_len); + if (!buf) + return -ENOMEM; + + memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len); + return __handle_control(con, buf); + } + + return __handle_control(con, con->v2.in_kvecs[0].iov_base); +} + +static int handle_control_remainder(struct ceph_connection *con) +{ + int ret; + + WARN_ON(!con_secure(con)); + + ret = decrypt_control_remainder(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad control remainder auth tag"; + return ret; + } + + return 
__handle_control(con, con->v2.in_kvecs[0].iov_base - + CEPH_PREAMBLE_INLINE_LEN); +} + +static int handle_epilogue(struct ceph_connection *con) +{ + u32 front_crc, middle_crc, data_crc; + int ret; + + if (con_secure(con)) { + ret = decrypt_message(con); + if (ret) { + if (ret == -EBADMSG) + con->error_msg = "integrity error, bad epilogue auth tag"; + return ret; + } + + /* just late_status */ + ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + } else { + ret = decode_epilogue(con->v2.in_buf, &front_crc, + &middle_crc, &data_crc); + if (ret) { + con->error_msg = "protocol error, bad epilogue"; + return ret; + } + + ret = verify_epilogue_crcs(con, front_crc, middle_crc, + data_crc); + if (ret) { + con->error_msg = "integrity error, bad crc"; + return ret; + } + } + + return process_message(con); +} + +static void finish_skip(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + + if (con_secure(con)) + gcm_inc_nonce(&con->v2.in_gcm_nonce); + + __finish_skip(con); +} + +static int populate_in_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d in_state %d\n", __func__, con, con->state, + con->v2.in_state); + WARN_ON(iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) { + ret = process_banner_prefix(con); + } else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) { + ret = process_banner_payload(con); + } else if ((con->state >= CEPH_CON_S_V2_HELLO && + con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) || + con->state == CEPH_CON_S_OPEN) { + switch (con->v2.in_state) { + case IN_S_HANDLE_PREAMBLE: + ret = handle_preamble(con); + break; + case IN_S_HANDLE_CONTROL: + ret = handle_control(con); + break; + case IN_S_HANDLE_CONTROL_REMAINDER: + ret = handle_control_remainder(con); + break; + case IN_S_PREPARE_READ_DATA: + prepare_read_data(con); + ret = 0; + break; + case IN_S_PREPARE_READ_DATA_CONT: + prepare_read_data_cont(con); + ret = 0; + break; + case IN_S_HANDLE_EPILOGUE: + ret = handle_epilogue(con); + break; + case IN_S_FINISH_SKIP: + finish_skip(con); + ret = 0; + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + return -EINVAL; + } + } else { + WARN(1, "bad state %d", con->state); + return -EINVAL; + } + if (ret) { + dout("%s con %p error %d\n", __func__, con, ret); + return ret; + } + + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.in_iter)); + return 1; +} + +int ceph_con_v2_try_read(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d need %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.in_iter)); + + if (con->state == CEPH_CON_S_PREOPEN) + return 0; + + /* + * We should always have something pending here. If not, + * avoid calling populate_in_iter() as if we read something + * (ceph_tcp_recv() would immediately return 1). 
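+ * Running dry here would indicate a messenger bug, hence the WARN_ON below.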
+ */ + if (WARN_ON(!iov_iter_count(&con->v2.in_iter))) + return -ENODATA; + + for (;;) { + ret = ceph_tcp_recv(con); + if (ret <= 0) + return ret; + + ret = populate_in_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "read processing error"; + return ret; + } + } +} + +static void queue_data(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = -1; + ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg, + data_len(con->out_msg)); + + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + con->v2.out_state = OUT_S_QUEUE_DATA_CONT; +} + +static void queue_data_cont(struct ceph_connection *con) +{ + struct bio_vec bv; + + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len); + + ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len); + if (con->v2.out_cursor.total_resid) { + get_bvec_at(&con->v2.out_cursor, &bv); + set_out_bvec(con, &bv, true); + WARN_ON(con->v2.out_state != OUT_S_QUEUE_DATA_CONT); + return; + } + + /* + * We've written all data. Queue epilogue. Once it's written, + * we are done. + */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, false); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void queue_enc_page(struct ceph_connection *con) +{ + struct bio_vec bv; + + dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i, + con->v2.out_enc_resid); + WARN_ON(!con->v2.out_enc_resid); + + bv.bv_page = con->v2.out_enc_pages[con->v2.out_enc_i]; + bv.bv_offset = 0; + bv.bv_len = min(con->v2.out_enc_resid, (int)PAGE_SIZE); + + set_out_bvec(con, &bv, false); + con->v2.out_enc_i++; + con->v2.out_enc_resid -= bv.bv_len; + + if (con->v2.out_enc_resid) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE); + return; + } + + /* + * We've queued the last piece of ciphertext (ending with + * epilogue) + auth tag. Once it's written, we are done. + */ + WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void queue_zeros(struct ceph_connection *con) +{ + dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero); + + if (con->v2.out_zero) { + set_out_bvec_zero(con); + con->v2.out_zero -= con->v2.out_bvec.bv_len; + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + /* + * We've zero-filled everything up to epilogue. Queue epilogue + * with late_status set to ABORTED and crcs adjusted for zeros. + * Once it's written, we are done patching up for the revoke. 
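+ * The ABORTED late_status lets the peer know the message was aborted and should be dropped.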
+ */ + reset_out_kvecs(con); + prepare_epilogue_plain(con, true); + con->v2.out_state = OUT_S_FINISH_MESSAGE; +} + +static void finish_message(struct ceph_connection *con) +{ + dout("%s con %p msg %p\n", __func__, con, con->out_msg); + + /* we end up here both plain and secure modes */ + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + /* message may have been revoked */ + if (con->out_msg) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + + con->v2.out_state = OUT_S_GET_NEXT; +} + +static int populate_out_iter(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d out_state %d\n", __func__, con, con->state, + con->v2.out_state); + WARN_ON(iov_iter_count(&con->v2.out_iter)); + + if (con->state != CEPH_CON_S_OPEN) { + WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX || + con->state > CEPH_CON_S_V2_SESSION_RECONNECT); + goto nothing_pending; + } + + switch (con->v2.out_state) { + case OUT_S_QUEUE_DATA: + WARN_ON(!con->out_msg); + queue_data(con); + goto populated; + case OUT_S_QUEUE_DATA_CONT: + WARN_ON(!con->out_msg); + queue_data_cont(con); + goto populated; + case OUT_S_QUEUE_ENC_PAGE: + queue_enc_page(con); + goto populated; + case OUT_S_QUEUE_ZEROS: + WARN_ON(con->out_msg); /* revoked */ + queue_zeros(con); + goto populated; + case OUT_S_FINISH_MESSAGE: + finish_message(con); + break; + case OUT_S_GET_NEXT: + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + return -EINVAL; + } + + WARN_ON(con->v2.out_state != OUT_S_GET_NEXT); + if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) { + ret = prepare_keepalive2(con); + if (ret) { + pr_err("prepare_keepalive2 failed: %d\n", ret); + return ret; + } + } else if (!list_empty(&con->out_queue)) { + ceph_con_get_out_msg(con); + ret = prepare_message(con); + if (ret) { + pr_err("prepare_message failed: %d\n", ret); + return ret; + } + } else if (con->in_seq > con->in_seq_acked) { + ret = prepare_ack(con); + if (ret) { + pr_err("prepare_ack failed: %d\n", ret); + return ret; + } + } else { + goto nothing_pending; + } + +populated: + if (WARN_ON(!iov_iter_count(&con->v2.out_iter))) + return -ENODATA; + dout("%s con %p populated %zu\n", __func__, con, + iov_iter_count(&con->v2.out_iter)); + return 1; + +nothing_pending: + WARN_ON(iov_iter_count(&con->v2.out_iter)); + dout("%s con %p nothing pending\n", __func__, con); + ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING); + return 0; +} + +int ceph_con_v2_try_write(struct ceph_connection *con) +{ + int ret; + + dout("%s con %p state %d have %zu\n", __func__, con, con->state, + iov_iter_count(&con->v2.out_iter)); + + /* open the socket first? */ + if (con->state == CEPH_CON_S_PREOPEN) { + WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2); + + /* + * Always bump global_seq. Bump connect_seq only if + * there is a session (i.e. we are reconnecting and will + * send session_reconnect instead of client_ident). 
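+ * The server may ask us to bump either of them further, see process_session_retry() and process_session_retry_global().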
+ */ + con->v2.global_seq = ceph_get_global_seq(con->msgr, 0); + if (con->v2.server_cookie) + con->v2.connect_seq++; + + ret = prepare_read_banner_prefix(con); + if (ret) { + pr_err("prepare_read_banner_prefix failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + reset_out_kvecs(con); + ret = prepare_banner(con); + if (ret) { + pr_err("prepare_banner failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + + ret = ceph_tcp_connect(con); + if (ret) { + pr_err("ceph_tcp_connect failed: %d\n", ret); + con->error_msg = "connect error"; + return ret; + } + } + + if (!iov_iter_count(&con->v2.out_iter)) { + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + return ret; + } + } + + tcp_sock_set_cork(con->sock->sk, true); + for (;;) { + ret = ceph_tcp_send(con); + if (ret <= 0) + break; + + ret = populate_out_iter(con); + if (ret <= 0) { + if (ret && ret != -EAGAIN && !con->error_msg) + con->error_msg = "write processing error"; + break; + } + } + + tcp_sock_set_cork(con->sock->sk, false); + return ret; +} + +static u32 crc32c_zeros(u32 crc, int zero_len) +{ + int len; + + while (zero_len) { + len = min(zero_len, (int)PAGE_SIZE); + crc = crc32c(crc, page_address(ceph_zero_page), len); + zero_len -= len; + } + + return crc; +} + +static void prepare_zero_front(struct ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > front_len(con->out_msg)); + sent = front_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.front_crc = + crc32c(-1, con->out_msg->front.iov_base, sent); + con->v2.out_epil.front_crc = + crc32c_zeros(con->v2.out_epil.front_crc, resid); + } else { + con->v2.out_epil.front_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} + +static void prepare_zero_middle(struct ceph_connection *con, int resid) +{ + int sent; + + WARN_ON(!resid || resid > middle_len(con->out_msg)); + sent = middle_len(con->out_msg) - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.middle_crc = + crc32c(-1, con->out_msg->middle->vec.iov_base, sent); + con->v2.out_epil.middle_crc = + crc32c_zeros(con->v2.out_epil.middle_crc, resid); + } else { + con->v2.out_epil.middle_crc = crc32c_zeros(-1, resid); + } + + con->v2.out_iter.count -= resid; + out_zero_add(con, resid); +} + +static void prepare_zero_data(struct ceph_connection *con) +{ + dout("%s con %p\n", __func__, con); + con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg)); + out_zero_add(con, data_len(con->out_msg)); +} + +static void revoke_at_queue_data(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + boundary = front_len(con->out_msg) + middle_len(con->out_msg); + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg); + if (resid 
> boundary) { + resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_data(con); + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + prepare_zero_data(con); + queue_zeros(con); +} + +static void revoke_at_queue_data_cont(struct ceph_connection *con) +{ + int sent, resid; /* current piece of data */ + + WARN_ON(!data_len(con->out_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + WARN_ON(!resid || resid > con->v2.out_bvec.bv_len); + sent = con->v2.out_bvec.bv_len - resid; + dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); + + if (sent) { + con->v2.out_epil.data_crc = ceph_crc32c_page( + con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page, + con->v2.out_bvec.bv_offset, sent); + ceph_msg_data_advance(&con->v2.out_cursor, sent); + } + WARN_ON(resid > con->v2.out_cursor.total_resid); + con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc, + con->v2.out_cursor.total_resid); + + con->v2.out_iter.count -= resid; + out_zero_add(con, con->v2.out_cursor.total_resid); + queue_zeros(con); +} + +static void revoke_at_finish_message(struct ceph_connection *con) +{ + int boundary; + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); + resid = iov_iter_count(&con->v2.out_iter); + + if (!front_len(con->out_msg) && !middle_len(con->out_msg) && + !data_len(con->out_msg)) { + WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head (empty message) - noop\n", + __func__, con); + return; + } + + boundary = front_len(con->out_msg) + middle_len(con->out_msg) + + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); + dout("%s con %p was sending head\n", __func__, con); + if (front_len(con->out_msg)) + prepare_zero_front(con, front_len(con->out_msg)); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); + con->v2.out_state = OUT_S_QUEUE_ZEROS; + return; + } + + boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending front\n", __func__, con); + prepare_zero_front(con, resid); + if (middle_len(con->out_msg)) + prepare_zero_middle(con, middle_len(con->out_msg)); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + boundary = CEPH_EPILOGUE_PLAIN_LEN; + if (resid > boundary) { + resid -= boundary; + dout("%s con %p was sending middle\n", __func__, con); + prepare_zero_middle(con, resid); + con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; + queue_zeros(con); + return; + } + + WARN_ON(!resid); + dout("%s con %p was sending epilogue - noop\n", __func__, con); +} + +void ceph_con_v2_revoke(struct ceph_connection *con) +{ + WARN_ON(con->v2.out_zero); + + if (con_secure(con)) { + WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE && + con->v2.out_state != OUT_S_FINISH_MESSAGE); + dout("%s con %p secure - noop\n", __func__, con); + return; + } + + switch (con->v2.out_state) { + case OUT_S_QUEUE_DATA: + revoke_at_queue_data(con); + break; + case OUT_S_QUEUE_DATA_CONT: + revoke_at_queue_data_cont(con); + break; + case OUT_S_FINISH_MESSAGE: + 
revoke_at_finish_message(con); + break; + default: + WARN(1, "bad out_state %d", con->v2.out_state); + break; + } +} + +static void revoke_at_prepare_read_data(struct ceph_connection *con) +{ + int remaining; /* data + [data padding] + epilogue */ + int resid; + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + if (con_secure(con)) + remaining = padded_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p resid %d remaining %d\n", __func__, con, resid, + remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_prepare_read_data_cont(struct ceph_connection *con) +{ + int recved, resid; /* current piece of data */ + int remaining; /* [data padding] + epilogue */ + + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid || resid > con->v2.in_bvec.bv_len); + recved = con->v2.in_bvec.bv_len - resid; + dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid); + + if (recved) + ceph_msg_data_advance(&con->v2.in_cursor, recved); + WARN_ON(resid > con->v2.in_cursor.total_resid); + + if (con_secure(con)) + remaining = padding_len(data_len(con->in_msg)) + + CEPH_EPILOGUE_SECURE_LEN; + else + remaining = CEPH_EPILOGUE_PLAIN_LEN; + + dout("%s con %p total_resid %zu remaining %d\n", __func__, con, + con->v2.in_cursor.total_resid, remaining); + con->v2.in_iter.count -= resid; + set_in_skip(con, con->v2.in_cursor.total_resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +static void revoke_at_handle_epilogue(struct ceph_connection *con) +{ + int resid; + + WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + WARN_ON(!resid); + + dout("%s con %p resid %d\n", __func__, con, resid); + con->v2.in_iter.count -= resid; + set_in_skip(con, resid); + con->v2.in_state = IN_S_FINISH_SKIP; +} + +void ceph_con_v2_revoke_incoming(struct ceph_connection *con) +{ + switch (con->v2.in_state) { + case IN_S_PREPARE_READ_DATA: + revoke_at_prepare_read_data(con); + break; + case IN_S_PREPARE_READ_DATA_CONT: + revoke_at_prepare_read_data_cont(con); + break; + case IN_S_HANDLE_EPILOGUE: + revoke_at_handle_epilogue(con); + break; + default: + WARN(1, "bad in_state %d", con->v2.in_state); + break; + } +} + +bool ceph_con_v2_opened(struct ceph_connection *con) +{ + return con->v2.peer_global_seq; +} + +void ceph_con_v2_reset_session(struct ceph_connection *con) +{ + con->v2.client_cookie = 0; + con->v2.server_cookie = 0; + con->v2.global_seq = 0; + con->v2.connect_seq = 0; + con->v2.peer_global_seq = 0; +} + +void ceph_con_v2_reset_protocol(struct ceph_connection *con) +{ + iov_iter_truncate(&con->v2.in_iter, 0); + iov_iter_truncate(&con->v2.out_iter, 0); + con->v2.out_zero = 0; + + clear_in_sign_kvecs(con); + clear_out_sign_kvecs(con); + free_conn_bufs(con); + + if (con->v2.out_enc_pages) { + WARN_ON(!con->v2.out_enc_page_cnt); + ceph_release_page_vector(con->v2.out_enc_pages, + con->v2.out_enc_page_cnt); + con->v2.out_enc_pages = NULL; + con->v2.out_enc_page_cnt = 0; + } + + con->v2.con_mode = CEPH_CON_MODE_UNKNOWN; + + if (con->v2.hmac_tfm) { + crypto_free_shash(con->v2.hmac_tfm); + con->v2.hmac_tfm = NULL; + } + if (con->v2.gcm_req) { + aead_request_free(con->v2.gcm_req); + con->v2.gcm_req = 
NULL; + } + if (con->v2.gcm_tfm) { + crypto_free_aead(con->v2.gcm_tfm); + con->v2.gcm_tfm = NULL; + } +} diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 792a8c4164d7..b9d54ed9f338 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -257,10 +257,16 @@ static void __open_session(struct ceph_mon_client *monc) &monc->monmap->mon_inst[monc->cur_mon].addr); /* - * send an initial keepalive to ensure our timestamp is valid - * by the time we are in an OPENED state + * Queue a keepalive to ensure that in case of an early fault + * the messenger doesn't put us into STANDBY state and instead + * retries. This also ensures that our timestamp is valid by + * the time we finish hunting and delayed_work() checks it. */ ceph_con_keepalive(&monc->con); + if (ceph_msgr2(monc->client)) { + monc->pending_auth = 1; + return; + } /* initiate authentication handshake */ ret = ceph_auth_build_hello(monc->auth, @@ -543,7 +549,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, p = msg->front.iov_base; end = p + msg->front.iov_len; - monmap = ceph_monmap_decode(&p, end, false); + monmap = ceph_monmap_decode(&p, end, ceph_msgr2(client)); if (IS_ERR(monmap)) { pr_err("problem decoding monmap, %d\n", (int)PTR_ERR(monmap)); @@ -1119,8 +1125,9 @@ static void delayed_work(struct work_struct *work) */ static int build_initial_monmap(struct ceph_mon_client *monc) { + __le32 my_type = ceph_msgr2(monc->client) ? + CEPH_ENTITY_ADDR_TYPE_MSGR2 : CEPH_ENTITY_ADDR_TYPE_LEGACY; struct ceph_options *opt = monc->client->options; - struct ceph_entity_addr *mon_addr = opt->mon_addr; int num_mon = opt->num_mon; int i; @@ -1129,12 +1136,16 @@ static int build_initial_monmap(struct ceph_mon_client *monc) GFP_KERNEL); if (!monc->monmap) return -ENOMEM; + for (i = 0; i < num_mon; i++) { - monc->monmap->mon_inst[i].addr = mon_addr[i]; - monc->monmap->mon_inst[i].addr.nonce = 0; - monc->monmap->mon_inst[i].name.type = - CEPH_ENTITY_TYPE_MON; - monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); + struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i]; + + memcpy(&inst->addr.in_addr, &opt->mon_addr[i].in_addr, + sizeof(inst->addr.in_addr)); + inst->addr.type = my_type; + inst->addr.nonce = 0; + inst->name.type = CEPH_ENTITY_TYPE_MON; + inst->name.num = cpu_to_le64(i); } monc->monmap->num_mon = num_mon; return 0; @@ -1337,6 +1348,88 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_validate_auth); +static int mon_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_get_request(monc->auth, buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_mon_client *monc = con->private; + int ret; + + mutex_lock(&monc->mutex); + ret = ceph_auth_handle_reply_more(monc->auth, reply, reply_len, + buf, *buf_len); + mutex_unlock(&monc->mutex); + if (ret < 0) + return ret; + + *buf_len = ret; + *authorizer = NULL; + *authorizer_len = 0; + return 0; +} + +static int mon_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int 
*con_secret_len) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + int ret; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ret = ceph_auth_handle_reply_done(monc->auth, global_id, + reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); + finish_auth(monc, ret, was_authed); + if (!ret) + finish_hunting(monc); + mutex_unlock(&monc->mutex); + return 0; +} + +static int mon_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_mon_client *monc = con->private; + bool was_authed; + + mutex_lock(&monc->mutex); + WARN_ON(!monc->hunting); + was_authed = ceph_auth_is_authenticated(monc->auth); + ceph_auth_handle_bad_method(monc->auth, used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt); + finish_auth(monc, -EACCES, was_authed); + mutex_unlock(&monc->mutex); + return 0; +} + /* * handle incoming message */ @@ -1487,4 +1580,8 @@ static const struct ceph_connection_operations mon_con_ops = { .dispatch = dispatch, .fault = mon_fault, .alloc_msg = mon_alloc_msg, + .get_auth_request = mon_get_auth_request, + .handle_auth_reply_more = mon_handle_auth_reply_more, + .handle_auth_done = mon_handle_auth_done, + .handle_auth_bad_method = mon_handle_auth_bad_method, }; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 51be5a7482fc..662b52e52651 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -3918,9 +3918,11 @@ static int handle_one_map(struct ceph_osd_client *osdc, set_pool_was_full(osdc); if (incremental) - newmap = osdmap_apply_incremental(&p, end, false, osdc->osdmap); + newmap = osdmap_apply_incremental(&p, end, + ceph_msgr2(osdc->client), + osdc->osdmap); else - newmap = ceph_osdmap_decode(&p, end, false); + newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client)); if (IS_ERR(newmap)) return PTR_ERR(newmap); @@ -5575,6 +5577,7 @@ static void put_osd_con(struct ceph_connection *con) /* * authentication */ + /* * Note: returned pointer is the address of a structure that's * managed separately. Caller must *not* attempt to free it. 
@@ -5640,6 +5643,80 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } +static int osd_get_auth_request(struct ceph_connection *con, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_reply_more(struct ceph_connection *con, + void *reply, int reply_len, + void *buf, int *buf_len, + void **authorizer, int *authorizer_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + int ret; + + ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len, + buf, buf_len); + if (ret) + return ret; + + *authorizer = auth->authorizer_buf; + *authorizer_len = auth->authorizer_buf_len; + return 0; +} + +static int osd_handle_auth_done(struct ceph_connection *con, + u64 global_id, void *reply, int reply_len, + u8 *session_key, int *session_key_len, + u8 *con_secret, int *con_secret_len) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_client *ac = o->o_osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + + return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len, + session_key, session_key_len, + con_secret, con_secret_len); +} + +static int osd_handle_auth_bad_method(struct ceph_connection *con, + int used_proto, int result, + const int *allowed_protos, int proto_cnt, + const int *allowed_modes, int mode_cnt) +{ + struct ceph_osd *o = con->private; + struct ceph_mon_client *monc = &o->o_osdc->client->monc; + int ret; + + if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD, + used_proto, result, + allowed_protos, proto_cnt, + allowed_modes, mode_cnt)) { + ret = ceph_monc_validate_auth(monc); + if (ret) + return ret; + } + + return -EACCES; +} + static void osd_reencode_message(struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); @@ -5677,4 +5754,8 @@ static const struct ceph_connection_operations osd_con_ops = { .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, .fault = osd_fault, + .get_auth_request = osd_get_auth_request, + .handle_auth_reply_more = osd_handle_auth_reply_more, + .handle_auth_done = osd_handle_auth_done, + .handle_auth_bad_method = osd_handle_auth_bad_method, }; -- cgit v1.2.3 From 2f0df6cfa325d7106b8a65bc0e02db1086e3f73b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 19 Nov 2020 20:00:10 +0100 Subject: libceph: drop ceph_auth_{create,update}_authorizer() Signed-off-by: Ilya Dryomov --- include/linux/ceph/auth.h | 6 ------ net/ceph/auth.c | 28 ---------------------------- 2 files changed, 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 3fbe72ebd779..71b5d481c653 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -126,13 +126,7 @@ int __ceph_auth_get_authorizer(struct ceph_auth_client *ac, struct ceph_auth_handshake *auth, int peer_type, bool force_new, int *proto, int *pref_mode, int *fallb_mode); -extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct 
ceph_auth_handshake *auth); void ceph_auth_destroy_authorizer(struct ceph_authorizer *a); -extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *a); int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, diff --git a/net/ceph/auth.c b/net/ceph/auth.c index 6b315c8212b1..eb261aa5fe18 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -326,40 +326,12 @@ out: } EXPORT_SYMBOL(__ceph_auth_get_authorizer); -int ceph_auth_create_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *auth) -{ - int ret = 0; - - mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->create_authorizer) - ret = ac->ops->create_authorizer(ac, peer_type, auth); - mutex_unlock(&ac->mutex); - return ret; -} -EXPORT_SYMBOL(ceph_auth_create_authorizer); - void ceph_auth_destroy_authorizer(struct ceph_authorizer *a) { a->destroy(a); } EXPORT_SYMBOL(ceph_auth_destroy_authorizer); -int ceph_auth_update_authorizer(struct ceph_auth_client *ac, - int peer_type, - struct ceph_auth_handshake *a) -{ - int ret = 0; - - mutex_lock(&ac->mutex); - if (ac->ops && ac->ops->update_authorizer) - ret = ac->ops->update_authorizer(ac, peer_type, a); - mutex_unlock(&ac->mutex); - return ret; -} -EXPORT_SYMBOL(ceph_auth_update_authorizer); - int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, struct ceph_authorizer *a, void *challenge_buf, -- cgit v1.2.3 From 1b04fa9900263b4e217ca2509fd778b32c2b4eb2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 9 Dec 2020 21:27:31 +0100 Subject: rcu-tasks: Move RCU-tasks initialization to before early_initcall() PowerPC testing encountered boot failures due to RCU Tasks not being fully initialized until core_initcall() time. This commit therefore initializes RCU Tasks (along with Rude RCU and RCU Tasks Trace) just before early_initcall() time, thus allowing waiting on RCU Tasks grace periods from early_initcall() handlers. Link: https://lore.kernel.org/rcu/87eekfh80a.fsf@dja-thinkpad.axtens.net/ Fixes: 36dadef23fcc ("kprobes: Init kprobes in early_initcall") Tested-by: Daniel Axtens Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 6 ++++++ init/main.c | 1 + kernel/rcu/tasks.h | 25 +++++++++++++++++++++---- 3 files changed, 28 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6cdd0152c253..5c119d6cecf1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -86,6 +86,12 @@ void rcu_sched_clock_irq(int user); void rcu_report_dead(unsigned int cpu); void rcutree_migrate_callbacks(int cpu); +#ifdef CONFIG_TASKS_RCU_GENERIC +void rcu_init_tasks_generic(void); +#else +static inline void rcu_init_tasks_generic(void) { } +#endif + #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); diff --git a/init/main.c b/init/main.c index 32b2a8affafd..9d964511fe0c 100644 --- a/init/main.c +++ b/init/main.c @@ -1512,6 +1512,7 @@ static noinline void __init kernel_init_freeable(void) init_mm_internals(); + rcu_init_tasks_generic(); do_pre_smp_initcalls(); lockup_detector_init(); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d5d9f2d03e8a..73bbe792fe1e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -241,7 +241,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. 
*/ +/* Spawn RCU-tasks grace-period kthread. */ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { struct task_struct *t; @@ -569,7 +569,6 @@ static int __init rcu_spawn_tasks_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks); return 0; } -core_initcall(rcu_spawn_tasks_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_classic_gp_kthread(void) @@ -697,7 +696,6 @@ static int __init rcu_spawn_tasks_rude_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } -core_initcall(rcu_spawn_tasks_rude_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_rude_gp_kthread(void) @@ -975,6 +973,11 @@ static void rcu_tasks_trace_pregp_step(void) static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { + // During early boot when there is only the one boot CPU, there + // is no idle task for the other CPUs. Just return. + if (unlikely(t == NULL)) + return; + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; @@ -1200,7 +1203,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); return 0; } -core_initcall(rcu_spawn_tasks_trace_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_trace_gp_kthread(void) @@ -1229,6 +1231,21 @@ void show_rcu_tasks_gp_kthreads(void) } #endif /* #ifndef CONFIG_TINY_RCU */ +void __init rcu_init_tasks_generic(void) +{ +#ifdef CONFIG_TASKS_RCU + rcu_spawn_tasks_kthread(); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + rcu_spawn_tasks_rude_kthread(); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + rcu_spawn_tasks_trace_kthread(); +#endif +} + #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} void show_rcu_tasks_gp_kthreads(void) {} -- cgit v1.2.3 From be98e05a67f05ff4c8349a51fcec993a28be718c Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Fri, 4 Dec 2020 21:02:42 +0100 Subject: dma-buf: Fix kerneldoc formatting I wanted to look up something and noticed the hyperlink doesn't work. While fixing that also noticed a trivial kerneldoc comment typo in the same section, fix that too. Reviewed-by: Michael J. Ruhl Reviewed-by: Simon Ser Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20201204200242.2671481-1-daniel.vetter@ffwll.ch --- Documentation/driver-api/dma-buf.rst | 2 +- include/linux/dma-buf-map.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst index d6b2a195dbed..a2133d69872c 100644 --- a/Documentation/driver-api/dma-buf.rst +++ b/Documentation/driver-api/dma-buf.rst @@ -190,7 +190,7 @@ DMA Fence uABI/Sync File Indefinite DMA Fences ~~~~~~~~~~~~~~~~~~~~~ -At various times &dma_fence with an indefinite time until dma_fence_wait() +At various times struct dma_fence with an indefinite time until dma_fence_wait() finishes have been proposed. 
Examples include: * Future fences, used in HWC1 to signal when a buffer isn't used by the display diff --git a/include/linux/dma-buf-map.h b/include/linux/dma-buf-map.h index 583a3a1f9447..278d489e4bdd 100644 --- a/include/linux/dma-buf-map.h +++ b/include/linux/dma-buf-map.h @@ -122,7 +122,7 @@ struct dma_buf_map { /** * DMA_BUF_MAP_INIT_VADDR - Initializes struct dma_buf_map to an address in system memory - * @vaddr: A system-memory address + * @vaddr_: A system-memory address */ #define DMA_BUF_MAP_INIT_VADDR(vaddr_) \ { \ -- cgit v1.2.3 From a313357e704f2617f298333e3e617a38b1719760 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:37 +0100 Subject: genirq: Move irq_has_action() into core code This function uses irq_to_desc() and is going to be used by modules to replace the open coded irq_to_desc() (ab)usage. The final goal is to remove the export of irq_to_desc() so driver cannot fiddle with it anymore. Move it into the core code and fixup the usage sites to include the proper header. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.548936472@linutronix.de --- arch/alpha/kernel/sys_jensen.c | 2 +- arch/x86/kernel/topology.c | 1 + include/linux/interrupt.h | 1 + include/linux/irqdesc.h | 7 +------ kernel/irq/manage.c | 17 +++++++++++++++++ 5 files changed, 21 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index 0a2ab6cb18db..e5d870ff225f 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -7,7 +7,7 @@ * * Code supporting the Jensen. */ - +#include #include #include #include diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 0a2ec801b63f..f5477eab5692 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -25,6 +25,7 @@ * * Send feedback to */ +#include #include #include #include diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 870b3251e174..bb8ff9083e7d 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -232,6 +232,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); # define local_irq_enable_in_hardirq() local_irq_enable() #endif +bool irq_has_action(unsigned int irq); extern void disable_irq_nosync(unsigned int irq); extern bool disable_hardirq(unsigned int irq); extern void disable_irq(unsigned int irq); diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 5745491303e0..385a4fafe631 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -179,12 +179,7 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, /* Test to see if a driver has successfully requested an irq */ static inline int irq_desc_has_action(struct irq_desc *desc) { - return desc->action != NULL; -} - -static inline int irq_has_action(unsigned int irq) -{ - return irq_desc_has_action(irq_to_desc(irq)); + return desc && desc->action != NULL; } /** diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c826ba4141fe..a5a1cde5c1a2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2822,3 +2822,20 @@ out_unlock: return err; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); + +/** + * irq_has_action - Check whether an interrupt is requested + * @irq: The linux irq number + * + * Returns: A snapshot of the current state + */ +bool irq_has_action(unsigned int irq) +{ + bool res; + + rcu_read_lock(); + res = irq_desc_has_action(irq_to_desc(irq)); + rcu_read_unlock(); + return 
res; +} +EXPORT_SYMBOL_GPL(irq_has_action); -- cgit v1.2.3 From fdd029630434b434b127efc7fba337da28f45658 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:38 +0100 Subject: genirq: Move status flag checks to core These checks are used by modules and prevent the removal of the export of irq_to_desc(). Move the accessor into the core. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.703779349@linutronix.de --- include/linux/irqdesc.h | 17 +++++------------ kernel/irq/manage.c | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 385a4fafe631..308d7db8991f 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -223,28 +223,21 @@ irq_set_chip_handler_name_locked(struct irq_data *data, struct irq_chip *chip, data->chip = chip; } +bool irq_check_status_bit(unsigned int irq, unsigned int bitmask); + static inline bool irq_balancing_disabled(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_NO_BALANCING_MASK; + return irq_check_status_bit(irq, IRQ_NO_BALANCING_MASK); } static inline bool irq_is_percpu(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_PER_CPU; + return irq_check_status_bit(irq, IRQ_PER_CPU); } static inline bool irq_is_percpu_devid(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_PER_CPU_DEVID; + return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); } static inline void diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a5a1cde5c1a2..ab8567f32501 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2839,3 +2839,23 @@ bool irq_has_action(unsigned int irq) return res; } EXPORT_SYMBOL_GPL(irq_has_action); + +/** + * irq_check_status_bit - Check whether bits in the irq descriptor status are set + * @irq: The linux irq number + * @bitmask: The bitmask to evaluate + * + * Returns: True if one of the bits in @bitmask is set + */ +bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) +{ + struct irq_desc *desc; + bool res = false; + + rcu_read_lock(); + desc = irq_to_desc(irq); + if (desc) + res = !!(desc->status_use_accessors & bitmask); + rcu_read_unlock(); + return res; +} -- cgit v1.2.3 From f1c6306c0d6b50844ba02c8a53e35405e9c0db05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:39 +0100 Subject: genirq: Move irq_set_lockdep_class() to core irq_set_lockdep_class() is used from modules and requires irq_to_desc() to be exported. Move it into the core code which lifts another requirement for the export. 
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.860029489@linutronix.de --- include/linux/irqdesc.h | 10 ++++------ kernel/irq/irqdesc.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 308d7db8991f..4a1d016716f4 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -240,16 +240,14 @@ static inline bool irq_is_percpu_devid(unsigned int irq) return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); } +void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class); static inline void irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { - struct irq_desc *desc = irq_to_desc(irq); - - if (desc) { - lockdep_set_class(&desc->lock, lock_class); - lockdep_set_class(&desc->request_mutex, request_class); - } + if (IS_ENABLED(CONFIG_LOCKDEP)) + __irq_set_lockdep_class(irq, lock_class, request_class); } #endif diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index e810eb9906ea..20a54fa7cd30 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -968,3 +968,17 @@ unsigned int kstat_irqs_usr(unsigned int irq) rcu_read_unlock(); return sum; } + +#ifdef CONFIG_LOCKDEP +void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + lockdep_set_class(&desc->lock, lock_class); + lockdep_set_class(&desc->request_mutex, request_class); + } +} +EXPORT_SYMBOL_GPL(__irq_set_lockdep_class); +#endif -- cgit v1.2.3 From 3e2380123fb96987ce958f623207010c667ffa7c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:40 +0100 Subject: genirq: Provide irq_get_effective_affinity() Provide an accessor to the effective interrupt affinity mask. Going to be used to replace open coded fiddling with the irq descriptor. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.967177918@linutronix.de --- include/linux/irq.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index c332871d59da..4aeb1c4c7e07 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -906,6 +906,13 @@ struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) } #endif +static inline struct cpumask *irq_get_effective_affinity_mask(unsigned int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + + return d ? irq_data_get_effective_affinity_mask(d) : NULL; +} + unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, -- cgit v1.2.3 From 26c19d0a8610fb233b31730fe26a31145f2d9796 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:43 +0100 Subject: genirq: Make kstat_irqs() static No more users outside the core code. 
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194043.268774449@linutronix.de --- include/linux/kernel_stat.h | 1 - kernel/irq/irqdesc.c | 19 ++++++------------- 2 files changed, 6 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 89f0745c096d..44ae1a7eb9e3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -67,7 +67,6 @@ static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) /* * Number of interrupts per specific IRQ source, since bootup */ -extern unsigned int kstat_irqs(unsigned int irq); extern unsigned int kstat_irqs_usr(unsigned int irq); /* diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 02b446a21ce6..2eb076f4a566 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -924,15 +924,7 @@ static bool irq_is_nmi(struct irq_desc *desc) return desc->istate & IRQS_NMI; } -/** - * kstat_irqs - Get the statistics for an interrupt - * @irq: The interrupt number - * - * Returns the sum of interrupt counts on all cpus since boot for - * @irq. The caller must ensure that the interrupt is not removed - * concurrently. - */ -unsigned int kstat_irqs(unsigned int irq) +static unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned int sum = 0; @@ -951,13 +943,14 @@ unsigned int kstat_irqs(unsigned int irq) } /** - * kstat_irqs_usr - Get the statistics for an interrupt + * kstat_irqs_usr - Get the statistics for an interrupt from thread context * @irq: The interrupt number * * Returns the sum of interrupt counts on all cpus since boot for @irq. - * Contrary to kstat_irqs() this can be called from any context. - * It uses rcu since a concurrent removal of an interrupt descriptor is - * observing an rcu grace period before delayed_free_desc()/irq_kobj_release(). + * + * It uses rcu to protect the access since a concurrent removal of an + * interrupt descriptor is observing an rcu grace period before + * delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) { -- cgit v1.2.3 From 501e2db67fa4264b517de5c7934e94cca89b3a1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:44 +0100 Subject: genirq: Provide kstat_irqdesc_cpu() Most users of kstat_irqs_cpu() have the irq descriptor already. No point in calling into the core code and looking it up once more. Use it in per_cpu_count_show() to start with. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194043.362094758@linutronix.de --- include/linux/irqdesc.h | 6 ++++++ kernel/irq/irqdesc.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 4a1d016716f4..891b323266df 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -113,6 +113,12 @@ static inline void irq_unlock_sparse(void) { } extern struct irq_desc irq_desc[NR_IRQS]; #endif +static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc, + unsigned int cpu) +{ + return desc->kstat_irqs ? 
*per_cpu_ptr(desc->kstat_irqs, cpu) : 0; +} + static inline struct irq_desc *irq_data_to_desc(struct irq_data *data) { return container_of(data->common, struct irq_desc, irq_common_data); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2eb076f4a566..f509c4db2029 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -147,12 +147,12 @@ static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - int cpu, irq = desc->irq_data.irq; ssize_t ret = 0; char *p = ""; + int cpu; for_each_possible_cpu(cpu) { - unsigned int c = kstat_irqs_cpu(irq, cpu); + unsigned int c = irq_desc_kstat_cpu(desc, cpu); ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); p = ","; -- cgit v1.2.3 From ee2cc4276ba4909438f5894a218877660e1536d9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 14 Dec 2020 21:08:00 +0100 Subject: cpufreq: Add special-purpose fast-switching callback for drivers First off, some cpufreq drivers (e.g. intel_pstate) can pass hints beyond the current target frequency to the hardware and there are no provisions for doing that in the cpufreq framework. In particular, today the driver has to assume that it should not allow the frequency to fall below the one requested by the governor (or the required capacity may not be provided) which may not be the case and which may lead to excessive energy usage in some scenarios. Second, the hints passed by these drivers to the hardware need not be in terms of the frequency, so representing the utilization numbers coming from the scheduler as frequency before passing them to those drivers is not really useful. Address the two points above by adding a special-purpose replacement for the ->fast_switch callback, called ->adjust_perf, allowing the governor to pass abstract performance level (rather than frequency) values for the minimum (required) and target (desired) performance along with the CPU capacity to compare them to. Also update the schedutil governor to use the new callback instead of ->fast_switch if present and if the utilization metrics are frequency-invariant (that is requisite for the direct mapping between the utilization and the CPU performance levels to be a reasonable approximation). Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 40 +++++++++++++++++++++++ include/linux/cpufreq.h | 14 +++++++++ include/linux/sched/cpufreq.h | 5 +++ kernel/sched/cpufreq_schedutil.c | 68 ++++++++++++++++++++++++++++++++++------ 4 files changed, 117 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c17aa2973c44..d0a3525ce27f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2097,6 +2097,46 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); +/** + * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go. + * @cpu: Target CPU. + * @min_perf: Minimum (required) performance level (units of @capacity). + * @target_perf: Target (desired) performance level (units of @capacity). + * @capacity: Capacity of the target CPU. + * + * Carry out a fast performance level switch of @cpu without sleeping.
+ * + * The driver's ->adjust_perf() callback invoked by this function must be + * suitable for being called from within RCU-sched read-side critical sections + * and it is expected to select a suitable performance level equal to or above + * @min_perf and preferably equal to or below @target_perf. + * + * This function must not be called if policy->fast_switch_enabled is unset. + * + * Governors calling this function must guarantee that it will never be invoked + * twice in parallel for the same CPU and that it will never be called in + * parallel with either ->target() or ->target_index() or ->fast_switch() for + * the same CPU. + */ +void cpufreq_driver_adjust_perf(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity) +{ + cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity); +} + +/** + * cpufreq_driver_has_adjust_perf - Check "direct fast switch" callback. + * + * Return 'true' if the ->adjust_perf callback is present for the + * current driver or 'false' otherwise. + */ +bool cpufreq_driver_has_adjust_perf(void) +{ + return !!cpufreq_driver->adjust_perf; +} + /* Must set freqs->new to intermediate frequency */ static int __target_intermediate(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, int index) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 584fccd4fcab..9c8b7437b6cd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -320,6 +320,15 @@ struct cpufreq_driver { unsigned int index); unsigned int (*fast_switch)(struct cpufreq_policy *policy, unsigned int target_freq); + /* + * ->fast_switch() replacement for drivers that use an internal + * representation of performance levels and can pass hints other than + * the target performance level to the hardware. 
+ */ + void (*adjust_perf)(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity); /* * Caches and returns the lowest driver-supported frequency greater than @@ -588,6 +597,11 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); +void cpufreq_driver_adjust_perf(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity); +bool cpufreq_driver_has_adjust_perf(void); int cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation); diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 3ed5aa18593f..6205578ab6ee 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -28,6 +28,11 @@ static inline unsigned long map_util_freq(unsigned long util, { return (freq + (freq >> 2)) * util / cap; } + +static inline unsigned long map_util_perf(unsigned long util) +{ + return util + (util >> 2); +} #endif /* CONFIG_CPU_FREQ */ #endif /* _LINUX_SCHED_CPUFREQ_H */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 319a270d13c1..803bcb30db27 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -432,13 +432,10 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_p sg_policy->limits_changed = true; } -static void sugov_update_single(struct update_util_data *hook, u64 time, - unsigned int flags) +static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, + u64 time, unsigned int flags) { - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned int cached_freq = sg_policy->cached_raw_freq; - unsigned int next_f; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -446,11 +443,25 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, ignore_dl_rate_limit(sg_cpu, sg_policy); if (!sugov_should_update_freq(sg_policy, time)) - return; + return false; sugov_get_util(sg_cpu); sugov_iowait_apply(sg_cpu, time); + return true; +} + +static void sugov_update_single_freq(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned int next_f; + + if (!sugov_update_single_common(sg_cpu, time, flags)) + return; + next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); /* * Do not reduce the frequency if the CPU has not been idle @@ -477,6 +488,38 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, } } +static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + unsigned long prev_util = sg_cpu->util; + + /* + * Fall back to the "frequency" path if frequency invariance is not + * supported, because the direct mapping between the utilization and + * the performance levels depends on the frequency invariance. 
+ */ + if (!arch_scale_freq_invariant()) { + sugov_update_single_freq(hook, time, flags); + return; + } + + if (!sugov_update_single_common(sg_cpu, time, flags)) + return; + + /* + * Do not reduce the target performance level if the CPU has not been + * idle recently, as the reduction is likely to be premature then. + */ + if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + sg_cpu->util = prev_util; + + cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), + map_util_perf(sg_cpu->util), sg_cpu->max); + + sg_cpu->sg_policy->last_freq_update_time = time; +} + static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; @@ -815,6 +858,7 @@ static void sugov_exit(struct cpufreq_policy *policy) static int sugov_start(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; + void (*uu)(struct update_util_data *data, u64 time, unsigned int flags); unsigned int cpu; sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; @@ -834,13 +878,17 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; } + if (policy_is_shared(policy)) + uu = sugov_update_shared; + else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf()) + uu = sugov_update_single_perf; + else + uu = sugov_update_single_freq; + for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - policy_is_shared(policy) ? - sugov_update_shared : - sugov_update_single); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu); } return 0; } -- cgit v1.2.3 From c18e68696fdd9fd293f051030bce5aaff3c9b185 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Mon, 14 Dec 2020 21:15:47 -0800 Subject: net/connector: Add const qualifier to cb_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The connector driver never modifies any cb_id passed to it, so add a const qualifier to those arguments so callers can declare their struct cb_id as a constant object. Fixes build warnings like these when passing a constant struct cb_id: warning: passing argument 1 of ‘cn_add_callback’ discards ‘const’ qualifier from pointer target Signed-off-by: Geoff Levand Link: https://lore.kernel.org/r/a9e49c9e-67fa-16e7-0a6b-72f6bd30c58a@infradead.org Signed-off-by: Jakub Kicinski --- Documentation/driver-api/connector.rst | 2 +- drivers/connector/cn_queue.c | 8 ++++---- drivers/connector/connector.c | 4 ++-- include/linux/connector.h | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/connector.rst b/Documentation/driver-api/connector.rst index 23d068191fb1..631b84a48aa5 100644 --- a/Documentation/driver-api/connector.rst +++ b/Documentation/driver-api/connector.rst @@ -25,7 +25,7 @@ handling, etc... 
The Connector driver allows any kernelspace agents to use netlink based networking for inter-process communication in a significantly easier way:: - int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *)); + int cn_add_callback(const struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *)); void cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask); void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask); diff --git a/drivers/connector/cn_queue.c b/drivers/connector/cn_queue.c index 49295052ba8b..996f025eb63c 100644 --- a/drivers/connector/cn_queue.c +++ b/drivers/connector/cn_queue.c @@ -19,7 +19,7 @@ static struct cn_callback_entry * cn_queue_alloc_callback_entry(struct cn_queue_dev *dev, const char *name, - struct cb_id *id, + const struct cb_id *id, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)) { @@ -51,13 +51,13 @@ void cn_queue_release_callback(struct cn_callback_entry *cbq) kfree(cbq); } -int cn_cb_equal(struct cb_id *i1, struct cb_id *i2) +int cn_cb_equal(const struct cb_id *i1, const struct cb_id *i2) { return ((i1->idx == i2->idx) && (i1->val == i2->val)); } int cn_queue_add_callback(struct cn_queue_dev *dev, const char *name, - struct cb_id *id, + const struct cb_id *id, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)) { @@ -90,7 +90,7 @@ int cn_queue_add_callback(struct cn_queue_dev *dev, const char *name, return 0; } -void cn_queue_del_callback(struct cn_queue_dev *dev, struct cb_id *id) +void cn_queue_del_callback(struct cn_queue_dev *dev, const struct cb_id *id) { struct cn_callback_entry *cbq, *n; int found = 0; diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c index 7d59d18c6f26..48ec7ce6ecac 100644 --- a/drivers/connector/connector.c +++ b/drivers/connector/connector.c @@ -193,7 +193,7 @@ static void cn_rx_skb(struct sk_buff *skb) * * May sleep. */ -int cn_add_callback(struct cb_id *id, const char *name, +int cn_add_callback(const struct cb_id *id, const char *name, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)) { @@ -214,7 +214,7 @@ EXPORT_SYMBOL_GPL(cn_add_callback); * * May sleep while waiting for reference counter to become zero. */ -void cn_del_callback(struct cb_id *id) +void cn_del_callback(const struct cb_id *id) { struct cn_dev *dev = &cdev; diff --git a/include/linux/connector.h b/include/linux/connector.h index cb732643471b..8ea860efea37 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -64,14 +64,14 @@ struct cn_dev { * @callback: connector's callback. * parameters are %cn_msg and the sender's credentials */ -int cn_add_callback(struct cb_id *id, const char *name, +int cn_add_callback(const struct cb_id *id, const char *name, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)); /** * cn_del_callback() - Unregisters new callback with connector core. * * @id: unique connector's user identifier. 
*/ -void cn_del_callback(struct cb_id *id); +void cn_del_callback(const struct cb_id *id); /** @@ -122,14 +122,14 @@ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 group, gfp int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 group, gfp_t gfp_mask); int cn_queue_add_callback(struct cn_queue_dev *dev, const char *name, - struct cb_id *id, + const struct cb_id *id, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)); -void cn_queue_del_callback(struct cn_queue_dev *dev, struct cb_id *id); +void cn_queue_del_callback(struct cn_queue_dev *dev, const struct cb_id *id); void cn_queue_release_callback(struct cn_callback_entry *); struct cn_queue_dev *cn_queue_alloc_dev(const char *name, struct sock *); void cn_queue_free_dev(struct cn_queue_dev *dev); -int cn_cb_equal(struct cb_id *, struct cb_id *); +int cn_cb_equal(const struct cb_id *, const struct cb_id *); #endif /* __CONNECTOR_H */ -- cgit v1.2.3 From 7061eb8cfa902daa1ec71d23b5cddb8b4391e72b Mon Sep 17 00:00:00 2001 From: Lijun Pan Date: Mon, 14 Dec 2020 15:19:28 -0600 Subject: net: core: introduce __netdev_notify_peers There are some use cases for netdev_notify_peers() in contexts where the rtnl lock is already held. Introduce a lockless version of netdev_notify_peers() so that such callers do not have to open-code the pair of calls call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); After that, convert netdev_notify_peers() to call the new helper. Suggested-by: Nathan Lynch Signed-off-by: Lijun Pan Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + net/core/dev.c | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7bf167993c05..259be67644e3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4547,6 +4547,7 @@ void __dev_set_rx_mode(struct net_device *dev); int dev_set_promiscuity(struct net_device *dev, int inc); int dev_set_allmulti(struct net_device *dev, int inc); void netdev_state_change(struct net_device *dev); +void __netdev_notify_peers(struct net_device *dev); void netdev_notify_peers(struct net_device *dev); void netdev_features_change(struct net_device *dev); /* Load a device via the kmod */ diff --git a/net/core/dev.c b/net/core/dev.c index a46334906c94..8fa739259041 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1456,6 +1456,25 @@ void netdev_state_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_state_change); +/** + * __netdev_notify_peers - notify network peers about existence of @dev, + * to be called when rtnl lock is already held. + * @dev: network device + * + * Generate traffic such that interested network peers are aware of + * @dev, such as by generating a gratuitous ARP. This may be used when + * a device wants to inform the rest of the network about some sort of + * reconfiguration such as a failover event or virtual machine + * migration.
+ */ +void __netdev_notify_peers(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); + call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); +} +EXPORT_SYMBOL(__netdev_notify_peers); + /** * netdev_notify_peers - notify network peers about existence of @dev * @dev: network device @@ -1469,8 +1488,7 @@ EXPORT_SYMBOL(netdev_state_change); void netdev_notify_peers(struct net_device *dev) { rtnl_lock(); - call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); - call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev); + __netdev_notify_peers(dev); rtnl_unlock(); } EXPORT_SYMBOL(netdev_notify_peers); -- cgit v1.2.3 From 767143a18d6d743d4254de5cf55b1bd87bb2af18 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Dec 2020 22:37:50 -0800 Subject: phy: fix kdoc warning Kdoc does not like it when multiline comment follows the networking style of starting right on the first line: include/linux/phy.h:869: warning: Function parameter or member 'config_intr' not described in 'phy_driver' Link: https://lore.kernel.org/r/20201215063750.3120976-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 381a95732b6a..9effb511acde 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -743,7 +743,8 @@ struct phy_driver { /** @read_status: Determines the negotiated speed and duplex */ int (*read_status)(struct phy_device *phydev); - /** @config_intr: Enables or disables interrupts. + /** + * @config_intr: Enables or disables interrupts. * It should also clear any pending interrupts prior to enabling the * IRQs and after disabling them. */ -- cgit v1.2.3 From 3df23a316c4a5d1764b034c71c29d67a17d5299f Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Sat, 5 Dec 2020 17:19:24 +0100 Subject: pwm: Remove unused function pwmchip_add_inversed() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is only defined with CONFIG_PWM unset and was introduced together with pwmchip_add_with_polarity() (which is only defined with CONFIG_PWM enabled). I guess the series that introduced pwmchip_add_with_polarity() had a different concept in earlier revisions and the !CONFIG_PWM part was just not updated accordingly. Given that there is no implementation for pwmchip_add_with_polarity() without CONFIG_PWM, just drop pwmchip_add_inversed() instead of renaming it to pwmchip_add_with_polarity(). Signed-off-by: Uwe Kleine-König Acked-by: Lee Jones Signed-off-by: Thierry Reding --- include/linux/pwm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index a13ff383fa1d..e4d84d4db293 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -473,11 +473,6 @@ static inline int pwmchip_add(struct pwm_chip *chip) return -EINVAL; } -static inline int pwmchip_add_inversed(struct pwm_chip *chip) -{ - return -EINVAL; -} - static inline int pwmchip_remove(struct pwm_chip *chip) { return -EINVAL; -- cgit v1.2.3 From 49e27134f6e9ebcd08c04a98ab7f0574b5a81a35 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 13 Dec 2020 14:06:41 +0200 Subject: net/mlx5: Fix compilation warning for 32-bit platform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MLX5_GENERAL_OBJECT_TYPES types bitfield is 64-bit field. 
Defining an enum for such bit fields on a 32-bit platform results in the warning below. ./include/vdso/bits.h:7:26: warning: left shift count >= width of type [-Wshift-count-overflow] ^ ./include/linux/mlx5/mlx5_ifc.h:10716:46: note: in expansion of macro ‘BIT’ MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT(0x20), ^~~ Use the 32-bit friendly BIT_ULL macro. Fixes: 2a2970891647 ("net/mlx5: Add sample offload hardware bits and structures") Signed-off-by: Parav Pandit Reported-by: Stephen Rothwell Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20201213120641.216032-1-leon@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 0d6e287d614f..8fbddec26eb8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10711,9 +10711,9 @@ struct mlx5_ifc_affiliated_event_header_bits { }; enum { - MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT(0xc), - MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT(0x13), - MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT(0x20), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT_ULL(0xc), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT_ULL(0x13), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT_ULL(0x20), }; enum { -- cgit v1.2.3 From 9bd23c31f392bda88618008f27fd52ee9e0fac38 Mon Sep 17 00:00:00 2001 From: Harshad Shirwadkar Date: Fri, 20 Nov 2020 12:22:32 -0800 Subject: jbd2: add a helper to find out number of fast commit blocks Add a helper to read the number of fast commit blocks from the jbd2 superblock, and rename JBD2_MIN_FC_BLKS to JBD2_DEFAULT_FAST_COMMIT_BLOCKS, since this constant is just the default number of fast commit blocks to use when the number of fast commit blocks isn't set in the jbd2 superblock.
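A worked example of the helper's fallback, with hypothetical values: for a journal whose superblock carries s_maxlen == 8192 and s_num_fc_blks == 0 (and j_last initially equal to s_maxlen), the load_superblock() hunk below now computes:

    num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); /* 0 -> 256, the default */
    journal->j_fc_last  = 8192;            /* be32_to_cpu(sb->s_maxlen) */
    journal->j_last     = 8192 - 256;      /* 7936, end of the regular area */
    journal->j_fc_first = 7936 + 1;        /* first fast commit block */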
Signed-off-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201120202232.2240293-2-harshadshirwadkar@gmail.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 8 ++------ include/linux/jbd2.h | 9 ++++++++- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 188f79d76988..2dc944442802 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1869,9 +1869,7 @@ static int load_superblock(journal_t *journal) if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); - num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks); - if (!num_fc_blocks) - num_fc_blocks = JBD2_MIN_FC_BLOCKS; + num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS) journal->j_last = journal->j_fc_last - num_fc_blocks; journal->j_fc_first = journal->j_last + 1; @@ -2102,9 +2100,7 @@ jbd2_journal_initialize_fast_commit(journal_t *journal) journal_superblock_t *sb = journal->j_superblock; unsigned long long num_fc_blks; - num_fc_blks = be32_to_cpu(sb->s_num_fc_blks); - if (num_fc_blks == 0) - num_fc_blks = JBD2_MIN_FC_BLOCKS; + num_fc_blks = jbd2_journal_get_num_fc_blks(sb); if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS) return -ENOSPC; diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d2a4860feb72..99d3cd051ac3 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -68,7 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags); extern void jbd2_free(void *ptr, size_t size); #define JBD2_MIN_JOURNAL_BLOCKS 1024 -#define JBD2_MIN_FC_BLOCKS 256 +#define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256 #ifdef __KERNEL__ @@ -1692,6 +1692,13 @@ static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) return journal->j_chksum_driver != NULL; } +static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb) +{ + int num_fc_blocks = be32_to_cpu(jsb->s_num_fc_blks); + + return num_fc_blocks ? num_fc_blocks : JBD2_DEFAULT_FAST_COMMIT_BLOCKS; +} + /* * Return number of free blocks in the log. Must be called under j_state_lock. */ -- cgit v1.2.3 From 476c135e321716ad7a8a5d4a19a636e2dcc50526 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 12 Nov 2020 08:39:59 +0200 Subject: vdpa: Add missing comment for virtqueue count Add the missing comment for the number of virtqueues. Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Acked-by: Jason Wang Link: https://lore.kernel.org/r/20201112064005.349268-2-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- include/linux/vdpa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 30bc7a7223bb..0fefeb976877 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -42,6 +42,7 @@ struct vdpa_vq_state { * @config: the configuration ops for this device. * @index: device index * @features_valid: were features initialized? for legacy guests + * @nvqs: maximum number of supported virtqueues */ struct vdpa_device { struct device dev; -- cgit v1.2.3 From a4055888629bc0467d12d912cd7c90acdf3d9b12 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 18 Dec 2020 14:01:31 -0800 Subject: mm/memcg: warning on !memcg after readahead page charged Add a VM_WARN_ON_ONCE_PAGE() macro. Since readahead pages are charged to a memcg too, in theory we don't have to check this exception now. Before safely removing these checks altogether, add a warning for the unexpected !memcg.
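For readers unfamiliar with the idiom, a minimal sketch of the new macro in use, condensed from the mem_cgroup_page_lruvec() hunks below:

    memcg = page_memcg(page);
    /* If the "every LRU page is charged" assumption is ever violated,
     * dump the offending page and warn, at most once per call site.
     */
    VM_WARN_ON_ONCE_PAGE(!memcg, page);
    if (!memcg)
            memcg = root_mem_cgroup;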
Link: https://lkml.kernel.org/r/1604283436-18880-3-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Acked-by: Michal Hocko Acked-by: Hugh Dickins Acked-by: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 13 +++++++++++++ mm/memcontrol.c | 10 ++++------ 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 2ad72d2c8cc5..5d0767cb424a 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -37,6 +37,18 @@ void dump_mm(const struct mm_struct *mm); BUG(); \ } \ } while (0) +#define VM_WARN_ON_ONCE_PAGE(cond, page) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + dump_page(page, "VM_WARN_ON_ONCE_PAGE(" __stringify(cond)")");\ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) + #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) #define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format) @@ -48,6 +60,7 @@ void dump_mm(const struct mm_struct *mm); #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c3b054066f5..7b9766789a27 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1362,10 +1362,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd } memcg = page_memcg(page); - /* - * Swapcache readahead pages are added to the LRU - and - * possibly migrated - before they are charged. - */ + VM_WARN_ON_ONCE_PAGE(!memcg, page); if (!memcg) memcg = root_mem_cgroup; @@ -6987,6 +6984,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) return; memcg = page_memcg(oldpage); + VM_WARN_ON_ONCE_PAGE(!memcg, oldpage); if (!memcg) return; @@ -7186,7 +7184,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) memcg = page_memcg(page); - /* Readahead page, never charged */ + VM_WARN_ON_ONCE_PAGE(!memcg, page); if (!memcg) return; @@ -7253,7 +7251,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg = page_memcg(page); - /* Readahead page, never charged */ + VM_WARN_ON_ONCE_PAGE(!memcg, page); if (!memcg) return 0; -- cgit v1.2.3 From bec78efd0061365a76f88e498affd7106b256823 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 18 Dec 2020 14:01:35 -0800 Subject: mm/memcg: remove unused definitions Some definitions are left unused, just clean them. 
Link: https://lkml.kernel.org/r/20201108003834.12669-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 118 --------------------------------------------- 1 file changed, 118 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 08ed57e02b73..196441f5dc99 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -913,41 +913,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } -/** - * mod_memcg_page_state - update page state statistics - * @page: the page - * @idx: page state item to account - * @val: number of pages (positive or negative) - * - * The @page must be locked or the caller must use lock_page_memcg() - * to prevent double accounting when the page is concurrently being - * moved to another memcg: - * - * lock_page(page) or lock_page_memcg(page) - * if (TestClearPageState(page)) - * mod_memcg_page_state(page, state, -1); - * unlock_page(page) or unlock_page_memcg(page) - * - * Kernel pages are an exception to this, since they'll never move. - */ -static inline void __mod_memcg_page_state(struct page *page, - int idx, int val) -{ - struct mem_cgroup *memcg = page_memcg(page); - - if (memcg) - __mod_memcg_state(memcg, idx, val); -} - -static inline void mod_memcg_page_state(struct page *page, - int idx, int val) -{ - struct mem_cgroup *memcg = page_memcg(page); - - if (memcg) - mod_memcg_state(memcg, idx, val); -} - static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { @@ -1395,18 +1360,6 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } -static inline void __mod_memcg_page_state(struct page *page, - int idx, - int nr) -{ -} - -static inline void mod_memcg_page_state(struct page *page, - int idx, - int nr) -{ -} - static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { @@ -1479,34 +1432,6 @@ static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) } #endif /* CONFIG_MEMCG */ -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void __inc_memcg_state(struct mem_cgroup *memcg, - int idx) -{ - __mod_memcg_state(memcg, idx, 1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void __dec_memcg_state(struct mem_cgroup *memcg, - int idx) -{ - __mod_memcg_state(memcg, idx, -1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void __inc_memcg_page_state(struct page *page, - int idx) -{ - __mod_memcg_page_state(page, idx, 1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void __dec_memcg_page_state(struct page *page, - int idx) -{ - __mod_memcg_page_state(page, idx, -1); -} - static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, 1); @@ -1517,34 +1442,6 @@ static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) __mod_lruvec_kmem_state(p, idx, -1); } -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void inc_memcg_state(struct mem_cgroup *memcg, - int idx) -{ - mod_memcg_state(memcg, idx, 1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void dec_memcg_state(struct mem_cgroup *memcg, - 
int idx) -{ - mod_memcg_state(memcg, idx, -1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void inc_memcg_page_state(struct page *page, - int idx) -{ - mod_memcg_page_state(page, idx, 1); -} - -/* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void dec_memcg_page_state(struct page *page, - int idx) -{ - mod_memcg_page_state(page, idx, -1); -} - static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; @@ -1733,21 +1630,6 @@ static inline void memcg_kmem_uncharge_page(struct page *page, int order) __memcg_kmem_uncharge_page(page, order); } -static inline int memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages) -{ - if (memcg_kmem_enabled()) - return __memcg_kmem_charge(memcg, gfp, nr_pages); - return 0; -} - -static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages) -{ - if (memcg_kmem_enabled()) - __memcg_kmem_uncharge(memcg, nr_pages); -} - /* * A helper for accessing memcg's kmem_id, used for getting * corresponding LRU lists. -- cgit v1.2.3 From 9a1ac2288cf16f9406ca54ef221bfcf262393b15 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Fri, 18 Dec 2020 14:01:41 -0800 Subject: mm/memcontrol: rewrite mem_cgroup_page_lruvec() mem_cgroup_page_lruvec() in memcontrol.c and mem_cgroup_lruvec() in memcontrol.h are very similar except for their parameters (page and memcg), which can also be converted to each other. So rewrite mem_cgroup_page_lruvec() in terms of mem_cgroup_lruvec(). [alex.shi@linux.alibaba.com: add missed warning in mem_cgroup_lruvec] Link: https://lkml.kernel.org/r/94f17bb7-ec61-5b72-3555-fabeb5a4d73b@linux.alibaba.com [lstoakes@gmail.com: warn on missing memcg on mem_cgroup_page_lruvec()] Link: https://lkml.kernel.org/r/20201125112202.387009-1-lstoakes@gmail.com Link: https://lkml.kernel.org/r/20201108143731.GA74138@rlk Signed-off-by: Hui Su Signed-off-by: Alex Shi Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Vladimir Davydov Cc: Yafang Shao Cc: Chris Down Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 +++++++++++++++++-- mm/memcontrol.c | 37 ------------------------------------- 2 files changed, 17 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 196441f5dc99..d827bd7f3bfe 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -620,9 +620,10 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec + * @pgdat: pglist_data * * Returns the lru list vector holding pages for a given @memcg & - * @node combination. This can be the node lruvec, if the memory + * @pgdat combination. This can be the node lruvec, if the memory * controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, @@ -652,7 +653,21 @@ out: return lruvec; } -struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); +/** + * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page + * @page: the page + * @pgdat: pgdat of the page + * + * This function relies on page->mem_cgroup being stable.
+ */ +static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, + struct pglist_data *pgdat) +{ + struct mem_cgroup *memcg = page_memcg(page); + + VM_WARN_ON_ONCE_PAGE(!memcg, page); + return mem_cgroup_lruvec(memcg, pgdat); +} static inline bool lruvec_holds_page_lru_lock(struct page *page, struct lruvec *lruvec) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7b9766789a27..605f671203ef 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1342,43 +1342,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) } #endif -/** - * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page - * @page: the page - * @pgdat: pgdat of the page - * - * This function relies on page's memcg being stable - see the - * access rules in commit_charge(). - */ -struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) -{ - struct mem_cgroup_per_node *mz; - struct mem_cgroup *memcg; - struct lruvec *lruvec; - - if (mem_cgroup_disabled()) { - lruvec = &pgdat->__lruvec; - goto out; - } - - memcg = page_memcg(page); - VM_WARN_ON_ONCE_PAGE(!memcg, page); - if (!memcg) - memcg = root_mem_cgroup; - - mz = mem_cgroup_page_nodeinfo(memcg, page); - lruvec = &mz->lruvec; -out: - /* - * Since a node can be onlined after the mem_cgroup was created, - * we have to be prepared to initialize lruvec->zone here; - * and if offlined then reonlined, we need to reinitialize it. - */ - if (unlikely(lruvec->pgdat != pgdat)) - lruvec->pgdat = pgdat; - return lruvec; -} - /** * lock_page_lruvec - lock and return lruvec for a given page. * @page: the page -- cgit v1.2.3 From b0a0c2615f6f199a656ed8549d7dce625d77aa77 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 18 Dec 2020 14:05:41 -0800 Subject: epoll: wire up syscall epoll_pwait2 Split off from prev patch in the series that implements the syscall. 
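For illustration, a hypothetical userspace sketch of the new entry point; it assumes the C library has no wrapper yet, so it goes through syscall(2), and it takes the syscall number 441 from the tables below:

    #include <sys/epoll.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    #ifndef __NR_epoll_pwait2
    #define __NR_epoll_pwait2 441
    #endif

    /* Wait on an existing epoll instance with a 500 us timeout; the
     * nanosecond-resolution timespec is what epoll_pwait() lacks.
     * NULL sigmask with zero sigsetsize means "do not touch signals";
     * passing a real mask would also require the kernel's sigset size
     * (_NSIG / 8), not glibc's sizeof(sigset_t).
     */
    static int demo_wait(int epfd, struct epoll_event *events, int maxevents)
    {
            struct timespec timeout = { .tv_sec = 0, .tv_nsec = 500000 };

            return syscall(__NR_epoll_pwait2, epfd, events, maxevents,
                           &timeout, NULL, 0);
    }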
Link: https://lkml.kernel.org/r/20201121144401.3727659-4-willemdebruijn.kernel@gmail.com Signed-off-by: Willem de Bruijn Cc: Al Viro Cc: Arnd Bergmann Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/compat.h | 6 ++++++ include/linux/syscalls.h | 5 +++++ include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 2 ++ 22 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index ee7b01bb7346..a6617067dbe6 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -480,3 +480,4 @@ 548 common pidfd_getfd sys_pidfd_getfd 549 common faccessat2 sys_faccessat2 550 common process_madvise sys_process_madvise +551 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index d056a548358e..20e1170e2e0a 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -454,3 +454,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index b3b2019f8d16..86a9d7b3eabe 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 441 +#define __NR_compat_syscalls 442 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 107f08e03b9f..f4bca2b90218 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -889,6 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_epoll_pwait2 441 +__SYSCALL(__NR_epoll_pwait2, sys_epoll_pwait2) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index b96ed8b8a508..bfc00f2bd437 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -361,3 +361,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 625fb6d32842..7fe4e45c864c 100644 --- 
a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index aae729c95cf9..a522adf194ab 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -446,3 +446,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 32817c954435..ad9c3dd0ab1f 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -379,3 +379,4 @@ 438 n32 pidfd_getfd sys_pidfd_getfd 439 n32 faccessat2 sys_faccessat2 440 n32 process_madvise sys_process_madvise +441 n32 epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 9e4ea3c31b1c..91649690b52f 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -355,3 +355,4 @@ 438 n64 pidfd_getfd sys_pidfd_getfd 439 n64 faccessat2 sys_faccessat2 440 n64 process_madvise sys_process_madvise +441 n64 epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 29f5f28cf5ce..4bad0c40aed6 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -428,3 +428,4 @@ 438 o32 pidfd_getfd sys_pidfd_getfd 439 o32 faccessat2 sys_faccessat2 440 o32 process_madvise sys_process_madvise +441 o32 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index f375ea528e59..6bcc31966b44 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -438,3 +438,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 1275daec7fec..f744eb5cba88 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -530,3 +530,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 28c168000483..14f6525886a8 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 783738448ff5..9df40ac0ebc0 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 438 common pidfd_getfd 
sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 78160260991b..c7da4c3271e6 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -486,3 +486,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 0d0667a9fbd7..874aeacde2dd 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -445,3 +445,4 @@ 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 440 i386 process_madvise sys_process_madvise +441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 379819244b91..78672124d28b 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -362,6 +362,7 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index b070f272995d..46116a28eeed 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -411,3 +411,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/include/linux/compat.h b/include/linux/compat.h index 400c0941c8af..6e65be753603 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -537,6 +537,12 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, int maxevents, int timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); +asmlinkage long compat_sys_epoll_pwait2(int epfd, + struct epoll_event __user *events, + int maxevents, + const struct __kernel_timespec __user *timeout, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize); /* fs/fcntl.c */ asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index df0c3c74609e..f3929aff39cf 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -362,6 +362,11 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, int timeout, const sigset_t __user *sigmask, size_t sigsetsize); +asmlinkage long sys_epoll_pwait2(int epfd, struct epoll_event __user *events, + int maxevents, + const struct __kernel_timespec __user *timeout, + const sigset_t __user *sigmask, + size_t sigsetsize); /* fs/fcntl.c */ asmlinkage long sys_dup(unsigned int fildes); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index fc48c64700eb..728752917785 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_epoll_pwait2 441 
+__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) #undef __NR_syscalls -#define __NR_syscalls 441 +#define __NR_syscalls 442 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f27ac94d5fa7..19aa806890d5 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -68,6 +68,8 @@ COND_SYSCALL(epoll_create1); COND_SYSCALL(epoll_ctl); COND_SYSCALL(epoll_pwait); COND_SYSCALL_COMPAT(epoll_pwait); +COND_SYSCALL(epoll_pwait2); +COND_SYSCALL_COMPAT(epoll_pwait2); /* fs/fcntl.c */ -- cgit v1.2.3 From 3b1a4a8640876a966ab68ab4f561642e19674671 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:00:14 -0800 Subject: kasan: group vmalloc code This is a preparatory commit for the upcoming addition of a new hardware tag-based (MTE-based) KASAN mode. Group all vmalloc-related function declarations in include/linux/kasan.h, and their implementations in mm/kasan/common.c. No functional changes. Link: https://lkml.kernel.org/r/80a6fdd29b039962843bd6cf22ce2643a7c8904e.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 41 +++++++++++++++------------ mm/kasan/common.c | 78 ++++++++++++++++++++++++++------------------------- 2 files changed, 63 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 30d343b4a40a..59538e795df4 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -75,19 +75,6 @@ struct kasan_cache { int free_meta_offset; }; -/* - * These functions provide a special case to support backing module - * allocations with real shadow memory. With KASAN vmalloc, the special - * case is unnecessary, as the work is handled in the generic case. 
- */ -#ifndef CONFIG_KASAN_VMALLOC -int kasan_module_alloc(void *addr, size_t size); -void kasan_free_shadow(const struct vm_struct *vm); -#else -static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } -static inline void kasan_free_shadow(const struct vm_struct *vm) {} -#endif - int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); @@ -156,9 +143,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, return false; } -static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } -static inline void kasan_free_shadow(const struct vm_struct *vm) {} - static inline int kasan_add_zero_shadow(void *start, unsigned long size) { return 0; @@ -211,13 +195,16 @@ static inline void *kasan_reset_tag(const void *addr) #endif /* CONFIG_KASAN_SW_TAGS */ #ifdef CONFIG_KASAN_VMALLOC + int kasan_populate_vmalloc(unsigned long addr, unsigned long size); void kasan_poison_vmalloc(const void *start, unsigned long size); void kasan_unpoison_vmalloc(const void *start, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); -#else + +#else /* CONFIG_KASAN_VMALLOC */ + static inline int kasan_populate_vmalloc(unsigned long start, unsigned long size) { @@ -232,7 +219,25 @@ static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end) {} -#endif + +#endif /* CONFIG_KASAN_VMALLOC */ + +#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC) + +/* + * These functions provide a special case to support backing module + * allocations with real shadow memory. With KASAN vmalloc, the special + * case is unnecessary, as the work is handled in the generic case. + */ +int kasan_module_alloc(void *addr, size_t size); +void kasan_free_shadow(const struct vm_struct *vm); + +#else /* CONFIG_KASAN && !CONFIG_KASAN_VMALLOC */ + +static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline void kasan_free_shadow(const struct vm_struct *vm) {} + +#endif /* CONFIG_KASAN && !CONFIG_KASAN_VMALLOC */ #ifdef CONFIG_KASAN_INLINE void kasan_non_canonical_hook(unsigned long addr); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 33d863f55db1..89e5ef9417a7 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -536,44 +536,6 @@ void kasan_kfree_large(void *ptr, unsigned long ip) /* The object will be poisoned by page_alloc. 
*/ } -#ifndef CONFIG_KASAN_VMALLOC -int kasan_module_alloc(void *addr, size_t size) -{ - void *ret; - size_t scaled_size; - size_t shadow_size; - unsigned long shadow_start; - - shadow_start = (unsigned long)kasan_mem_to_shadow(addr); - scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT; - shadow_size = round_up(scaled_size, PAGE_SIZE); - - if (WARN_ON(!PAGE_ALIGNED(shadow_start))) - return -EINVAL; - - ret = __vmalloc_node_range(shadow_size, 1, shadow_start, - shadow_start + shadow_size, - GFP_KERNEL, - PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, - __builtin_return_address(0)); - - if (ret) { - __memset(ret, KASAN_SHADOW_INIT, shadow_size); - find_vm_area(addr)->flags |= VM_KASAN; - kmemleak_ignore(ret); - return 0; - } - - return -ENOMEM; -} - -void kasan_free_shadow(const struct vm_struct *vm) -{ - if (vm->flags & VM_KASAN) - vfree(kasan_mem_to_shadow(vm->addr)); -} -#endif - #ifdef CONFIG_MEMORY_HOTPLUG static bool shadow_mapped(unsigned long addr) { @@ -685,6 +647,7 @@ core_initcall(kasan_memhotplug_init); #endif #ifdef CONFIG_KASAN_VMALLOC + static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { @@ -923,4 +886,43 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, (unsigned long)shadow_end); } } + +#else /* CONFIG_KASAN_VMALLOC */ + +int kasan_module_alloc(void *addr, size_t size) +{ + void *ret; + size_t scaled_size; + size_t shadow_size; + unsigned long shadow_start; + + shadow_start = (unsigned long)kasan_mem_to_shadow(addr); + scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT; + shadow_size = round_up(scaled_size, PAGE_SIZE); + + if (WARN_ON(!PAGE_ALIGNED(shadow_start))) + return -EINVAL; + + ret = __vmalloc_node_range(shadow_size, 1, shadow_start, + shadow_start + shadow_size, + GFP_KERNEL, + PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, + __builtin_return_address(0)); + + if (ret) { + __memset(ret, KASAN_SHADOW_INIT, shadow_size); + find_vm_area(addr)->flags |= VM_KASAN; + kmemleak_ignore(ret); + return 0; + } + + return -ENOMEM; +} + +void kasan_free_shadow(const struct vm_struct *vm) +{ + if (vm->flags & VM_KASAN) + vfree(kasan_mem_to_shadow(vm->addr)); +} + #endif -- cgit v1.2.3 From d5750edf6da759576f91ec2b57d5553985815b40 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:00:17 -0800 Subject: kasan: shadow declarations only for software modes This is a preparatory commit for the upcoming addition of a new hardware tag-based (MTE-based) KASAN mode. Group shadow-related KASAN function declarations and only define them for the two existing software modes. No functional changes for software modes. 
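As background for the grouping below, a sketch of the shadow arithmetic these declarations describe; the parameters quoted are the generic mode's (the software tag-based mode uses a scale shift of 4 and the 0xFF initializer instead):

    /* Generic mode: KASAN_SHADOW_SCALE_SHIFT == 3, one shadow byte per
     * 8-byte granule, so the mapping compresses the address space 8:1:
     *
     *     shadow = (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
     *              + KASAN_SHADOW_OFFSET;
     *
     * A 64-byte slab object therefore owns 8 consecutive shadow bytes,
     * all holding KASAN_SHADOW_INIT (0) while the object is accessible.
     */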
Link: https://lkml.kernel.org/r/35126.1606402815@turing-police Link: https://lore.kernel.org/linux-arm-kernel/24105.1606397102@turing-police/ Link: https://lkml.kernel.org/r/e88d94eff94db883a65dca52e1736d80d28dd9bc.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Signed-off-by: Valdis Kletnieks Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon [valdis.kletnieks@vt.edu: fix build issue with asmlinkage] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 48 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 59538e795df4..7828436a3a99 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -11,7 +11,7 @@ struct task_struct; #ifdef CONFIG_KASAN -#include +#include #include /* kasan_data struct is used in KUnit tests for KASAN expected failures */ @@ -20,6 +20,20 @@ struct kunit_kasan_expectation { bool report_found; }; +#endif + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + +#include + +/* Software KASAN implementations use shadow memory. */ + +#ifdef CONFIG_KASAN_SW_TAGS +#define KASAN_SHADOW_INIT 0xFF +#else +#define KASAN_SHADOW_INIT 0 +#endif + extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE]; extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD]; @@ -35,6 +49,23 @@ static inline void *kasan_mem_to_shadow(const void *addr) + KASAN_SHADOW_OFFSET; } +int kasan_add_zero_shadow(void *start, unsigned long size); +void kasan_remove_zero_shadow(void *start, unsigned long size); + +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +static inline int kasan_add_zero_shadow(void *start, unsigned long size) +{ + return 0; +} +static inline void kasan_remove_zero_shadow(void *start, + unsigned long size) +{} + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +#ifdef CONFIG_KASAN + /* Enable reporting bugs after kasan_disable_current() */ extern void kasan_enable_current(void); @@ -75,9 +106,6 @@ struct kasan_cache { int free_meta_offset; }; -int kasan_add_zero_shadow(void *start, unsigned long size); -void kasan_remove_zero_shadow(void *start, unsigned long size); - size_t __ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { @@ -143,14 +171,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, return false; } -static inline int kasan_add_zero_shadow(void *start, unsigned long size) -{ - return 0; -} -static inline void kasan_remove_zero_shadow(void *start, - unsigned long size) -{} - static inline void kasan_unpoison_slab(const void *ptr) { } static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } @@ -158,8 +178,6 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } #ifdef CONFIG_KASAN_GENERIC -#define KASAN_SHADOW_INIT 0 - void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); @@ -174,8 +192,6 @@ static inline void kasan_record_aux_stack(void *ptr) {} #ifdef CONFIG_KASAN_SW_TAGS -#define KASAN_SHADOW_INIT 0xFF - void kasan_init_tags(void); void *kasan_reset_tag(const void *addr); -- cgit 
v1.2.3 From cebd0eb29acdfc2f5e44e5f356ffcd0c44f16b4a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:00:21 -0800 Subject: kasan: rename (un)poison_shadow to (un)poison_range This is a preparatory commit for the upcoming addition of a new hardware tag-based (MTE-based) KASAN mode. The new mode won't be using shadow memory. Rename external annotation kasan_unpoison_shadow() to kasan_unpoison_range(), and introduce internal functions (un)poison_range() (without kasan_ prefix). Co-developed-by: Marco Elver Link: https://lkml.kernel.org/r/fccdcaa13dc6b2211bf363d6c6d499279a54fe3a.1606161801.git.andreyknvl@google.com Signed-off-by: Marco Elver Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 +++--- kernel/fork.c | 4 ++-- mm/kasan/common.c | 49 +++++++++++++++++++++++++++---------------------- mm/kasan/generic.c | 23 +++++++++++------------ mm/kasan/kasan.h | 3 ++- mm/kasan/tags.c | 2 +- mm/slab_common.c | 2 +- 7 files changed, 47 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 7828436a3a99..9740c06a04a1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -72,7 +72,7 @@ extern void kasan_enable_current(void); /* Disable reporting bugs for current task */ extern void kasan_disable_current(void); -void kasan_unpoison_shadow(const void *address, size_t size); +void kasan_unpoison_range(const void *address, size_t size); void kasan_unpoison_task_stack(struct task_struct *task); @@ -109,7 +109,7 @@ struct kasan_cache { size_t __ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { - kasan_unpoison_shadow(ptr, __ksize(ptr)); + kasan_unpoison_range(ptr, __ksize(ptr)); } size_t kasan_metadata_size(struct kmem_cache *cache); @@ -118,7 +118,7 @@ void kasan_restore_multi_shot(bool enabled); #else /* CONFIG_KASAN */ -static inline void kasan_unpoison_shadow(const void *address, size_t size) {} +static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_unpoison_task_stack(struct task_struct *task) {} diff --git a/kernel/fork.c b/kernel/fork.c index 41906a52a764..37720a6d04ea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -225,8 +225,8 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; - /* Clear the KASAN shadow of the stack. */ - kasan_unpoison_shadow(s->addr, THREAD_SIZE); + /* Mark stack accessible for KASAN. */ + kasan_unpoison_range(s->addr, THREAD_SIZE); /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 89e5ef9417a7..73e79a34671b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -108,7 +108,7 @@ void *memcpy(void *dest, const void *src, size_t len) * Poisons the shadow memory for 'size' bytes starting from 'addr'. * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. 
*/ -void kasan_poison_shadow(const void *address, size_t size, u8 value) +void poison_range(const void *address, size_t size, u8 value) { void *shadow_start, *shadow_end; @@ -125,7 +125,7 @@ void kasan_poison_shadow(const void *address, size_t size, u8 value) __memset(shadow_start, value, shadow_end - shadow_start); } -void kasan_unpoison_shadow(const void *address, size_t size) +void unpoison_range(const void *address, size_t size) { u8 tag = get_tag(address); @@ -136,7 +136,7 @@ void kasan_unpoison_shadow(const void *address, size_t size) */ address = reset_tag(address); - kasan_poison_shadow(address, size, tag); + poison_range(address, size, tag); if (size & KASAN_SHADOW_MASK) { u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); @@ -148,12 +148,17 @@ void kasan_unpoison_shadow(const void *address, size_t size) } } +void kasan_unpoison_range(const void *address, size_t size) +{ + unpoison_range(address, size); +} + static void __kasan_unpoison_stack(struct task_struct *task, const void *sp) { void *base = task_stack_page(task); size_t size = sp - base; - kasan_unpoison_shadow(base, size); + unpoison_range(base, size); } /* Unpoison the entire stack for a task. */ @@ -172,7 +177,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) */ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); - kasan_unpoison_shadow(base, watermark - base); + unpoison_range(base, watermark - base); } void kasan_alloc_pages(struct page *page, unsigned int order) @@ -186,13 +191,13 @@ void kasan_alloc_pages(struct page *page, unsigned int order) tag = random_tag(); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); + unpoison_range(page_address(page), PAGE_SIZE << order); } void kasan_free_pages(struct page *page, unsigned int order) { if (likely(!PageHighMem(page))) - kasan_poison_shadow(page_address(page), + poison_range(page_address(page), PAGE_SIZE << order, KASAN_FREE_PAGE); } @@ -284,18 +289,18 @@ void kasan_poison_slab(struct page *page) for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); - kasan_poison_shadow(page_address(page), page_size(page), - KASAN_KMALLOC_REDZONE); + poison_range(page_address(page), page_size(page), + KASAN_KMALLOC_REDZONE); } void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { - kasan_unpoison_shadow(object, cache->object_size); + unpoison_range(object, cache->object_size); } void kasan_poison_object_data(struct kmem_cache *cache, void *object) { - kasan_poison_shadow(object, + poison_range(object, round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), KASAN_KMALLOC_REDZONE); } @@ -408,7 +413,7 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, } rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); + poison_range(object, rounded_up_size, KASAN_KMALLOC_FREE); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) || unlikely(!(cache->flags & SLAB_KASAN))) @@ -448,9 +453,9 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, tag = assign_tag(cache, object, false, keep_tag); /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ - kasan_unpoison_shadow(set_tag(object, tag), size); - kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE); + unpoison_range(set_tag(object, tag), size); + poison_range((void *)redzone_start, redzone_end - 
redzone_start, + KASAN_KMALLOC_REDZONE); if (cache->flags & SLAB_KASAN) kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags); @@ -489,9 +494,9 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, KASAN_SHADOW_SCALE_SIZE); redzone_end = (unsigned long)ptr + page_size(page); - kasan_unpoison_shadow(ptr, size); - kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, - KASAN_PAGE_REDZONE); + unpoison_range(ptr, size); + poison_range((void *)redzone_start, redzone_end - redzone_start, + KASAN_PAGE_REDZONE); return (void *)ptr; } @@ -523,7 +528,7 @@ void kasan_poison_kfree(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); return; } - kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE); + poison_range(ptr, page_size(page), KASAN_FREE_PAGE); } else { __kasan_slab_free(page->slab_cache, ptr, ip, false); } @@ -709,7 +714,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) * // vmalloc() allocates memory * // let a = area->addr * // we reach kasan_populate_vmalloc - * // and call kasan_unpoison_shadow: + * // and call unpoison_range: * STORE shadow(a), unpoison_val * ... * STORE shadow(a+99), unpoison_val x = LOAD p @@ -744,7 +749,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size) return; size = round_up(size, KASAN_SHADOW_SCALE_SIZE); - kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID); + poison_range(start, size, KASAN_VMALLOC_INVALID); } void kasan_unpoison_vmalloc(const void *start, unsigned long size) @@ -752,7 +757,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) if (!is_vmalloc_or_module_addr(start)) return; - kasan_unpoison_shadow(start, size); + unpoison_range(start, size); } static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index d341859a1b95..9fe44f9b3b30 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -202,11 +202,11 @@ static void register_global(struct kasan_global *global) { size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); - kasan_unpoison_shadow(global->beg, global->size); + unpoison_range(global->beg, global->size); - kasan_poison_shadow(global->beg + aligned_size, - global->size_with_redzone - aligned_size, - KASAN_GLOBAL_REDZONE); + poison_range(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE); } void __asan_register_globals(struct kasan_global *globals, size_t size) @@ -285,13 +285,12 @@ void __asan_alloca_poison(unsigned long addr, size_t size) WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); - kasan_unpoison_shadow((const void *)(addr + rounded_down_size), - size - rounded_down_size); - kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_LEFT); - kasan_poison_shadow(right_redzone, - padding_size + KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_RIGHT); + unpoison_range((const void *)(addr + rounded_down_size), + size - rounded_down_size); + poison_range(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_LEFT); + poison_range(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_RIGHT); } EXPORT_SYMBOL(__asan_alloca_poison); @@ -301,7 +300,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) if (unlikely(!stack_top || stack_top > stack_bottom)) return; - kasan_unpoison_shadow(stack_top, stack_bottom - stack_top); + unpoison_range(stack_top, stack_bottom - stack_top); } EXPORT_SYMBOL(__asan_allocas_unpoison); 
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index ac499456740f..42ab02c61331 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -150,7 +150,8 @@ static inline bool addr_has_shadow(const void *addr) return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); } -void kasan_poison_shadow(const void *address, size_t size, u8 value); +void poison_range(const void *address, size_t size, u8 value); +void unpoison_range(const void *address, size_t size); /** * check_memory_region - Check memory region, and report if invalid access. diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 5c8b08a25715..c0b3f327812b 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) { - kasan_poison_shadow((void *)addr, size, tag); + poison_range((void *)addr, size, tag); } EXPORT_SYMBOL(__hwasan_tag_memory); diff --git a/mm/slab_common.c b/mm/slab_common.c index 2f2b55c2798e..573fbacd9ef5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1176,7 +1176,7 @@ size_t ksize(const void *objp) * We assume that ksize callers could use whole allocated area, * so we need to unpoison this area. */ - kasan_unpoison_shadow(objp, size); + kasan_unpoison_range(objp, size); return size; } EXPORT_SYMBOL(ksize); -- cgit v1.2.3 From d73b49365ee65ac48074bdb5aa717bb4644dbbb7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:00:56 -0800 Subject: kasan, arm64: only use kasan_depth for software modes This is a preparatory commit for the upcoming addition of a new hardware tag-based (MTE-based) KASAN mode. Hardware tag-based KASAN won't use kasan_depth. Only define and use it when one of the software KASAN modes is enabled. No functional changes for software modes. Link: https://lkml.kernel.org/r/e16f15aeda90bc7fb4dfc2e243a14b74cc5c8219.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Reviewed-by: Catalin Marinas Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/kasan_init.c | 11 ++++++++--- include/linux/kasan.h | 18 +++++++++--------- include/linux/sched.h | 2 +- init/init_task.c | 2 +- mm/kasan/common.c | 2 ++ mm/kasan/report.c | 2 ++ 6 files changed, 23 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index ffeb80d5aa8d..5172799f831f 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -273,17 +273,22 @@ static void __init kasan_init_shadow(void) cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); } +static void __init kasan_init_depth(void) +{ + init_task.kasan_depth = 0; +} + #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) */ static inline void __init kasan_init_shadow(void) { } +static inline void __init kasan_init_depth(void) { } + #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void __init kasan_init(void) { kasan_init_shadow(); - - /* At this point kasan is fully initialized.
Enable error messages */ - init_task.kasan_depth = 0; + kasan_init_depth(); pr_info("KernelAddressSanitizer initialized\n"); } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9740c06a04a1..b272960e8396 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -52,6 +52,12 @@ static inline void *kasan_mem_to_shadow(const void *addr) int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); +/* Enable reporting bugs after kasan_disable_current() */ +extern void kasan_enable_current(void); + +/* Disable reporting bugs for current task */ +extern void kasan_disable_current(void); + #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ static inline int kasan_add_zero_shadow(void *start, unsigned long size) @@ -62,16 +68,13 @@ static inline void kasan_remove_zero_shadow(void *start, unsigned long size) {} +static inline void kasan_enable_current(void) {} +static inline void kasan_disable_current(void) {} + #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ #ifdef CONFIG_KASAN -/* Enable reporting bugs after kasan_disable_current() */ -extern void kasan_enable_current(void); - -/* Disable reporting bugs for current task */ -extern void kasan_disable_current(void); - void kasan_unpoison_range(const void *address, size_t size); void kasan_unpoison_task_stack(struct task_struct *task); @@ -122,9 +125,6 @@ static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_unpoison_task_stack(struct task_struct *task) {} -static inline void kasan_enable_current(void) {} -static inline void kasan_disable_current(void) {} - static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} static inline void kasan_free_pages(struct page *page, unsigned int order) {} diff --git a/include/linux/sched.h b/include/linux/sched.h index 51d535b69bd6..6e3a5eeec509 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1234,7 +1234,7 @@ struct task_struct { u64 timer_slack_ns; u64 default_timer_slack_ns; -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif diff --git a/init/init_task.c b/init/init_task.c index 15f6eb93a04f..8a992d73e6fb 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -176,7 +176,7 @@ struct task_struct init_task .numa_group = NULL, .numa_faults = NULL, #endif -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) .kasan_depth = 1, #endif #ifdef CONFIG_KCSAN diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 663ffa71cd20..d5f23b2f170a 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -46,6 +46,7 @@ void kasan_set_track(struct kasan_track *track, gfp_t flags) track->stack = kasan_save_stack(flags); } +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_enable_current(void) { current->kasan_depth++; @@ -55,6 +56,7 @@ void kasan_disable_current(void) { current->kasan_depth--; } +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ void kasan_unpoison_range(const void *address, size_t size) { diff --git a/mm/kasan/report.c b/mm/kasan/report.c index d140d26cfb31..914ab5cfc3ea 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -292,8 +292,10 @@ static void print_shadow_for_address(const void *addr) static bool report_enabled(void) { +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) if (current->kasan_depth) return false; +#endif if (test_bit(KASAN_BIT_MULTI_SHOT, 
&kasan_flags)) return true; return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); -- cgit v1.2.3 From 60a3a5fe950f4e6c02e9fc6676dc96de043ed743 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:01:03 -0800 Subject: kasan, arm64: rename kasan_init_tags and mark as __init Rename kasan_init_tags() to kasan_init_sw_tags() as the upcoming hardware tag-based KASAN mode will have its own initialization routine. Also similarly to kasan_init() mark kasan_init_tags() as __init. Link: https://lkml.kernel.org/r/71e52af72a09f4b50c8042f16101c60e50649fbb.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Catalin Marinas Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/setup.c | 2 +- arch/arm64/mm/kasan_init.c | 2 +- include/linux/kasan.h | 4 ++-- mm/kasan/sw_tags.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index c44eb4b80163..c18aacde8bb0 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -358,7 +358,7 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) smp_build_mpidr_hash(); /* Init percpu seeds for random tags after cpus are set up. */ - kasan_init_tags(); + kasan_init_sw_tags(); #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index e35ce04beed1..d8e66c78440e 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -283,7 +283,7 @@ void __init kasan_init(void) kasan_init_shadow(); kasan_init_depth(); #if defined(CONFIG_KASAN_GENERIC) - /* CONFIG_KASAN_SW_TAGS also requires kasan_init_tags(). */ + /* CONFIG_KASAN_SW_TAGS also requires kasan_init_sw_tags(). */ pr_info("KernelAddressSanitizer initialized\n"); #endif } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b272960e8396..d7042d129dc1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -192,7 +192,7 @@ static inline void kasan_record_aux_stack(void *ptr) {} #ifdef CONFIG_KASAN_SW_TAGS -void kasan_init_tags(void); +void __init kasan_init_sw_tags(void); void *kasan_reset_tag(const void *addr); @@ -201,7 +201,7 @@ bool kasan_report(unsigned long addr, size_t size, #else /* CONFIG_KASAN_SW_TAGS */ -static inline void kasan_init_tags(void) { } +static inline void kasan_init_sw_tags(void) { } static inline void *kasan_reset_tag(const void *addr) { diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 9445cf4ccdc8..7317d5229b2b 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -35,7 +35,7 @@ static DEFINE_PER_CPU(u32, prng_state); -void kasan_init_tags(void) +void __init kasan_init_sw_tags(void) { int cpu; -- cgit v1.2.3 From 0fea6e9af889f1a4e072f5de999e07fe6859fc88 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:02:06 -0800 Subject: kasan, arm64: expand CONFIG_KASAN checks Some #ifdef CONFIG_KASAN checks are only relevant for software KASAN modes (either related to shadow memory or compiler instrumentation). Expand those into CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS. 
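The transformation is mechanical. As a hedged before/after sketch (setup_shadow() is a placeholder name, not a function touched by this patch):

/* Before: taken for any KASAN mode, including the upcoming HW_TAGS. */
#ifdef CONFIG_KASAN
	setup_shadow();
#endif

/* After: only the two software modes, which actually use shadow memory
 * and compiler instrumentation, take this path. */
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
	setup_shadow();
#endif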
Link: https://lkml.kernel.org/r/e6971e432dbd72bb897ff14134ebb7e169bdcf0c.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Reviewed-by: Catalin Marinas Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/arm64/Makefile | 2 +- arch/arm64/include/asm/assembler.h | 2 +- arch/arm64/include/asm/memory.h | 2 +- arch/arm64/include/asm/string.h | 5 +++-- arch/arm64/kernel/head.S | 2 +- arch/arm64/kernel/image-vars.h | 2 +- arch/arm64/kernel/kaslr.c | 3 ++- arch/arm64/kernel/module.c | 6 ++++-- arch/arm64/mm/ptdump.c | 6 +++--- include/linux/kasan-checks.h | 2 +- include/linux/kasan.h | 7 ++++--- include/linux/moduleloader.h | 3 ++- include/linux/string.h | 2 +- mm/ptdump.c | 13 ++++++++----- scripts/Makefile.lib | 2 ++ 16 files changed, 36 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a33718f8b62f..9386b108b132 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -334,7 +334,7 @@ config BROKEN_GAS_INST config KASAN_SHADOW_OFFSET hex - depends on KASAN + depends on KASAN_GENERIC || KASAN_SW_TAGS default 0xdfff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && !KASAN_SW_TAGS default 0xdfffc00000000000 if ARM64_VA_BITS_47 && !KASAN_SW_TAGS default 0xdffffe0000000000 if ARM64_VA_BITS_42 && !KASAN_SW_TAGS diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 05f46a60b245..6be9b3750250 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -137,7 +137,7 @@ head-y := arch/arm64/kernel/head.o ifeq ($(CONFIG_KASAN_SW_TAGS), y) KASAN_SHADOW_SCALE_SHIFT := 4 -else +else ifeq ($(CONFIG_KASAN_GENERIC), y) KASAN_SHADOW_SCALE_SHIFT := 3 endif diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index ddbe6bf00e33..bf125c591116 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -473,7 +473,7 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU #define NOKPROBE(x) #endif -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define EXPORT_SYMBOL_NOKASAN(name) #else #define EXPORT_SYMBOL_NOKASAN(name) EXPORT_SYMBOL(name) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 3bc08e6cf82e..cd671fb6707c 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -72,7 +72,7 @@ * address space for the shadow region respectively. They can bloat the stack * significantly, so double the (minimum) stack size when they are in use. 
*/ -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) #define KASAN_SHADOW_END ((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) \ + KASAN_SHADOW_OFFSET) diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index b31e8e87a0db..3a3264ff47b9 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -5,7 +5,7 @@ #ifndef __ASM_STRING_H #define __ASM_STRING_H -#ifndef CONFIG_KASAN +#if !(defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) #define __HAVE_ARCH_STRRCHR extern char *strrchr(const char *, int c); @@ -48,7 +48,8 @@ extern void *__memset(void *, int, __kernel_size_t); void memcpy_flushcache(void *dst, const void *src, size_t cnt); #endif -#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(__SANITIZE_ADDRESS__) /* * For files that are not instrumented (e.g. mm/slub.c) we diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 42b23ce679dc..a0dc987724ed 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -433,7 +433,7 @@ SYM_FUNC_START_LOCAL(__primary_switched) bl __pi_memset dsb ishst // Make zero page visible to PTW -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bl kasan_early_init #endif #ifdef CONFIG_RANDOMIZE_BASE diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 39289d75118d..f676243abac6 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -37,7 +37,7 @@ __efistub_strncmp = __pi_strncmp; __efistub_strrchr = __pi_strrchr; __efistub___clean_dcache_area_poc = __pi___clean_dcache_area_poc; -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) __efistub___memcpy = __pi_memcpy; __efistub___memmove = __pi_memmove; __efistub___memset = __pi_memset; diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 0921aa1520b0..1c74c45b9494 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -161,7 +161,8 @@ u64 __init kaslr_early_init(u64 dt_phys) /* use the top 16 bits to randomize the linear region */ memstart_offset_seed = seed >> 48; - if (IS_ENABLED(CONFIG_KASAN)) + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) /* * KASAN does not expect the module region to intersect the * vmalloc region, since shadow memory is allocated for each diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 2a1ad95d9b2c..fe21e0f06492 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -30,7 +30,8 @@ void *module_alloc(unsigned long size) if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS)) gfp_mask |= __GFP_NOWARN; - if (IS_ENABLED(CONFIG_KASAN)) + if (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS)) /* don't exceed the static module region - see below */ module_alloc_end = MODULES_END; @@ -39,7 +40,8 @@ void *module_alloc(unsigned long size) NUMA_NO_NODE, __builtin_return_address(0)); if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && - !IS_ENABLED(CONFIG_KASAN)) + !IS_ENABLED(CONFIG_KASAN_GENERIC) && + !IS_ENABLED(CONFIG_KASAN_SW_TAGS)) /* * KASAN can only deal with module allocations being served * from the reserved module region, since the remainder of diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 807dc634bbd2..04137a8f3d2d 100644 --- 
a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -29,7 +29,7 @@ enum address_markers_idx { PAGE_OFFSET_NR = 0, PAGE_END_NR, -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) KASAN_START_NR, #endif }; @@ -37,7 +37,7 @@ enum address_markers_idx { static struct addr_marker address_markers[] = { { PAGE_OFFSET, "Linear Mapping start" }, { 0 /* PAGE_END */, "Linear Mapping end" }, -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) { 0 /* KASAN_SHADOW_START */, "Kasan shadow start" }, { KASAN_SHADOW_END, "Kasan shadow end" }, #endif @@ -383,7 +383,7 @@ void ptdump_check_wx(void) static int ptdump_init(void) { address_markers[PAGE_END_NR].start_address = PAGE_END; -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) address_markers[KASAN_START_NR].start_address = KASAN_SHADOW_START; #endif ptdump_initialize(); diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index ac6aba632f2d..ca5e89fb10d3 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -9,7 +9,7 @@ * even in compilation units that selectively disable KASAN, but must use KASAN * to validate access to an address. Never use these in header files! */ -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) bool __kasan_check_read(const volatile void *p, unsigned int size); bool __kasan_check_write(const volatile void *p, unsigned int size); #else diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d7042d129dc1..b1381ee6922a 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -238,7 +238,8 @@ static inline void kasan_release_vmalloc(unsigned long start, #endif /* CONFIG_KASAN_VMALLOC */ -#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC) +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(CONFIG_KASAN_VMALLOC) /* * These functions provide a special case to support backing module @@ -248,12 +249,12 @@ static inline void kasan_release_vmalloc(unsigned long start, int kasan_module_alloc(void *addr, size_t size); void kasan_free_shadow(const struct vm_struct *vm); -#else /* CONFIG_KASAN && !CONFIG_KASAN_VMALLOC */ +#else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} -#endif /* CONFIG_KASAN && !CONFIG_KASAN_VMALLOC */ +#endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ #ifdef CONFIG_KASAN_INLINE void kasan_non_canonical_hook(unsigned long addr); diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 4fa67a8b2265..9e09d11ffe5b 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -96,7 +96,8 @@ void module_arch_cleanup(struct module *mod); /* Any cleanup before freeing mod->module_init */ void module_arch_freeing_init(struct module *mod); -#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC) +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(CONFIG_KASAN_VMALLOC) #include #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) #else diff --git a/include/linux/string.h b/include/linux/string.h index 1cd63a8a23ab..4fcfb56abcf5 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -267,7 +267,7 @@ void __write_overflow(void) __compiletime_error("detected write beyond 
size of o #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); diff --git a/mm/ptdump.c b/mm/ptdump.c index ba88ec43ff21..4354c1422d57 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -4,7 +4,7 @@ #include #include -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) /* * This is an optimization for KASAN=y case. Since all kasan page tables * eventually point to the kasan_early_shadow_page we could call note_page() @@ -31,7 +31,8 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, struct ptdump_state *st = walk->private; pgd_t val = READ_ONCE(*pgd); -#if CONFIG_PGTABLE_LEVELS > 4 && defined(CONFIG_KASAN) +#if CONFIG_PGTABLE_LEVELS > 4 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d))) return note_kasan_page_table(walk, addr); #endif @@ -51,7 +52,8 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, struct ptdump_state *st = walk->private; p4d_t val = READ_ONCE(*p4d); -#if CONFIG_PGTABLE_LEVELS > 3 && defined(CONFIG_KASAN) +#if CONFIG_PGTABLE_LEVELS > 3 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud))) return note_kasan_page_table(walk, addr); #endif @@ -71,7 +73,8 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, struct ptdump_state *st = walk->private; pud_t val = READ_ONCE(*pud); -#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN) +#if CONFIG_PGTABLE_LEVELS > 2 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd))) return note_kasan_page_table(walk, addr); #endif @@ -91,7 +94,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, struct ptdump_state *st = walk->private; pmd_t val = READ_ONCE(*pmd); -#if defined(CONFIG_KASAN) +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte))) return note_kasan_page_table(walk, addr); #endif diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 94133708889d..213677a5ed33 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -148,10 +148,12 @@ endif # we don't want to check (depends on variables KASAN_SANITIZE_obj.o, KASAN_SANITIZE) # ifeq ($(CONFIG_KASAN),y) +ifneq ($(CONFIG_KASAN_HW_TAGS),y) _c_flags += $(if $(patsubst n%,, \ $(KASAN_SANITIZE_$(basetarget).o)$(KASAN_SANITIZE)y), \ $(CFLAGS_KASAN), $(CFLAGS_KASAN_NOSANITIZE)) endif +endif ifeq ($(CONFIG_UBSAN),y) _c_flags += $(if $(patsubst n%,, \ -- cgit v1.2.3 From 2e903b91479782b7dedd869603423d77e079d3de Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:02:10 -0800 Subject: kasan, arm64: implement HW_TAGS runtime Provide implementation of KASAN functions required for the hardware tag-based mode. Those include core functions for memory and pointer tagging (tags_hw.c) and bug reporting (report_tags_hw.c). Also adapt common KASAN code to support the new mode. 
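(Per the diffstat below, the new files land as mm/kasan/hw_tags.c and mm/kasan/report_hw_tags.c.) The key difference from the software modes is that poisoning no longer writes shadow bytes but programs memory tags through the arch hooks. A simplified sketch of the model, reusing the hw_*() helpers that the new hw_tags.c below relies on (sketch_poison() itself is illustrative only):

/*
 * Sketch, not a hunk of this patch: under HW_TAGS, poisoning assigns a
 * tag to whole KASAN_GRANULE_SIZE-aligned granules of memory; a later
 * access faults when the pointer tag does not match the memory tag.
 */
static void sketch_poison(const void *addr, size_t size, u8 tag)
{
	hw_set_mem_tag_range(reset_tag(addr),
			     round_up(size, KASAN_GRANULE_SIZE), tag);
}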
Link: https://lkml.kernel.org/r/cfd0fbede579a6b66755c98c88c108e54f9c56bf.1606161801.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Acked-by: Catalin Marinas Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/memory.h | 4 +- arch/arm64/kernel/cpufeature.c | 3 ++ arch/arm64/kernel/smp.c | 2 + include/linux/kasan.h | 24 ++++++++---- include/linux/mm.h | 2 +- include/linux/page-flags-layout.h | 2 +- mm/kasan/Makefile | 5 +++ mm/kasan/common.c | 15 ++++---- mm/kasan/hw_tags.c | 80 +++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 19 ++++++++-- mm/kasan/report_hw_tags.c | 42 ++++++++++++++++++++ mm/kasan/report_sw_tags.c | 2 +- mm/kasan/shadow.c | 2 +- mm/kasan/sw_tags.c | 2 +- 14 files changed, 178 insertions(+), 26 deletions(-) create mode 100644 mm/kasan/hw_tags.c create mode 100644 mm/kasan/report_hw_tags.c (limited to 'include/linux') diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index cd671fb6707c..18fce223b67b 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -214,7 +214,7 @@ static inline unsigned long kaslr_offset(void) (__force __typeof__(addr))__addr; \ }) -#ifdef CONFIG_KASAN_SW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) #define __tag_shifted(tag) ((u64)(tag) << 56) #define __tag_reset(addr) __untagged_addr(addr) #define __tag_get(addr) (__u8)((u64)(addr) >> 56) @@ -222,7 +222,7 @@ static inline unsigned long kaslr_offset(void) #define __tag_shifted(tag) 0UL #define __tag_reset(addr) (addr) #define __tag_get(addr) 0 -#endif /* CONFIG_KASAN_SW_TAGS */ +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline const void *__tag_set(const void *addr, u8 tag) { diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d87cfc6246e0..7ffb5f1d8b68 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -1710,6 +1711,8 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap) cleared_zero_page = true; mte_clear_page_tags(lm_alias(empty_zero_page)); } + + kasan_init_hw_tags_cpu(); } #endif /* CONFIG_ARM64_MTE */ diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 2499b895efea..19b1705ae5cb 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -462,6 +462,8 @@ void __init smp_prepare_boot_cpu(void) /* Conditionally switch to GIC PMR for interrupt masking */ if (system_uses_irq_prio_masking()) init_gic_priority_masking(); + + kasan_init_hw_tags(); } static u64 __init of_get_cpu_mpidr(struct device_node *dn) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b1381ee6922a..d22ec4c9c1bd 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -190,25 +190,35 @@ static inline void kasan_record_aux_stack(void *ptr) {} #endif /* CONFIG_KASAN_GENERIC */ -#ifdef CONFIG_KASAN_SW_TAGS - -void __init kasan_init_sw_tags(void); +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) void *kasan_reset_tag(const void *addr); bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -#else /* CONFIG_KASAN_SW_TAGS */ - -static inline void kasan_init_sw_tags(void) { } 
+#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline void *kasan_reset_tag(const void *addr) { return (void *)addr; } -#endif /* CONFIG_KASAN_SW_TAGS */ +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS*/ + +#ifdef CONFIG_KASAN_SW_TAGS +void __init kasan_init_sw_tags(void); +#else +static inline void kasan_init_sw_tags(void) { } +#endif + +#ifdef CONFIG_KASAN_HW_TAGS +void kasan_init_hw_tags_cpu(void); +void __init kasan_init_hw_tags(void); +#else +static inline void kasan_init_hw_tags_cpu(void) { } +static inline void kasan_init_hw_tags(void) { } +#endif #ifdef CONFIG_KASAN_VMALLOC diff --git a/include/linux/mm.h b/include/linux/mm.h index 362579ad0758..024ec0a00c72 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1421,7 +1421,7 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) } #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_KASAN_SW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) static inline u8 page_kasan_tag(const struct page *page) { return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index e200eef6a7fd..7d4ec26d8a3e 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -77,7 +77,7 @@ #define LAST_CPUPID_SHIFT 0 #endif -#ifdef CONFIG_KASAN_SW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) #define KASAN_TAG_WIDTH 8 #else #define KASAN_TAG_WIDTH 0 diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index f1d68a34f3c9..9fe39a66388a 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -10,8 +10,10 @@ CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_quarantine.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report_generic.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report_hw_tags.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report_sw_tags.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_shadow.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_hw_tags.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sw_tags.o = $(CC_FLAGS_FTRACE) # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 @@ -27,10 +29,13 @@ CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_report_generic.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_report_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) obj-$(CONFIG_KASAN) := common.o report.o obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o +obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d5f23b2f170a..02613883846e 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -118,7 +118,7 @@ void kasan_free_pages(struct page *page, unsigned int order) */ static inline unsigned int optimal_redzone(unsigned int object_size) { - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) return 0; return @@ -183,14 +183,14 @@ size_t kasan_metadata_size(struct kmem_cache *cache) struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, const void *object) { - return (void *)object + cache->kasan_info.alloc_meta_offset; + return (void *)reset_tag(object) + 
cache->kasan_info.alloc_meta_offset; } struct kasan_free_meta *get_free_info(struct kmem_cache *cache, const void *object) { BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - return (void *)object + cache->kasan_info.free_meta_offset; + return (void *)reset_tag(object) + cache->kasan_info.free_meta_offset; } void kasan_poison_slab(struct page *page) @@ -272,9 +272,8 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, alloc_info = get_alloc_info(cache, object); __memset(alloc_info, 0, sizeof(*alloc_info)); - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - object = set_tag(object, - assign_tag(cache, object, true, false)); + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) || IS_ENABLED(CONFIG_KASAN_HW_TAGS)) + object = set_tag(object, assign_tag(cache, object, true, false)); return (void *)object; } @@ -342,10 +341,10 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, redzone_end = round_up((unsigned long)object + cache->object_size, KASAN_GRANULE_SIZE); - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) || IS_ENABLED(CONFIG_KASAN_HW_TAGS)) tag = assign_tag(cache, object, false, keep_tag); - /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ + /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */ unpoison_range(set_tag(object, tag), size); poison_range((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c new file mode 100644 index 000000000000..66419e908e21 --- /dev/null +++ b/mm/kasan/hw_tags.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains core hardware tag-based KASAN code. + * + * Copyright (c) 2020 Google, Inc. + * Author: Andrey Konovalov + */ + +#define pr_fmt(fmt) "kasan: " fmt + +#include +#include +#include +#include +#include +#include + +#include "kasan.h" + +/* kasan_init_hw_tags_cpu() is called for each CPU. */ +void kasan_init_hw_tags_cpu(void) +{ + hw_init_tags(KASAN_TAG_MAX); + hw_enable_tagging(); +} + +/* kasan_init_hw_tags() is called once on boot CPU. 
*/ +void __init kasan_init_hw_tags(void) +{ + pr_info("KernelAddressSanitizer initialized\n"); +} + +void *kasan_reset_tag(const void *addr) +{ + return reset_tag(addr); +} + +void poison_range(const void *address, size_t size, u8 value) +{ + hw_set_mem_tag_range(reset_tag(address), + round_up(size, KASAN_GRANULE_SIZE), value); +} + +void unpoison_range(const void *address, size_t size) +{ + hw_set_mem_tag_range(reset_tag(address), + round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); +} + +u8 random_tag(void) +{ + return hw_get_random_tag(); +} + +bool check_invalid_free(void *addr) +{ + u8 ptr_tag = get_tag(addr); + u8 mem_tag = hw_get_mem_tag(addr); + + return (mem_tag == KASAN_TAG_INVALID) || + (ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag); +} + +void kasan_set_free_info(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = get_alloc_info(cache, object); + kasan_set_track(&alloc_meta->free_track[0], GFP_NOWAIT); +} + +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = get_alloc_info(cache, object); + return &alloc_meta->free_track[0]; +} diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 92cb2c16e314..64560cc71191 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -154,6 +154,11 @@ struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, struct kasan_free_meta *get_free_info(struct kmem_cache *cache, const void *object); +void poison_range(const void *address, size_t size, u8 value); +void unpoison_range(const void *address, size_t size); + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) @@ -165,9 +170,6 @@ static inline bool addr_has_metadata(const void *addr) return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); } -void poison_range(const void *address, size_t size, u8 value); -void unpoison_range(const void *address, size_t size); - /** * check_memory_region - Check memory region, and report if invalid access. * @addr: the accessed address @@ -179,6 +181,15 @@ void unpoison_range(const void *address, size_t size); bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip); +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +static inline bool addr_has_metadata(const void *addr) +{ + return true; +} + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + bool check_invalid_free(void *addr); void *find_first_bad_addr(void *addr, size_t size); @@ -215,7 +226,7 @@ static inline void quarantine_reduce(void) { } static inline void quarantine_remove_cache(struct kmem_cache *cache) { } #endif -#ifdef CONFIG_KASAN_SW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) void print_tags(u8 addr_tag, const void *addr); diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c new file mode 100644 index 000000000000..da543eb832cd --- /dev/null +++ b/mm/kasan/report_hw_tags.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains hardware tag-based KASAN specific error reporting code. + * + * Copyright (c) 2020 Google, Inc. 
+ * Author: Andrey Konovalov + */ + +#include +#include +#include +#include +#include +#include + +#include "kasan.h" + +const char *get_bug_type(struct kasan_access_info *info) +{ + return "invalid-access"; +} + +void *find_first_bad_addr(void *addr, size_t size) +{ + return reset_tag(addr); +} + +void metadata_fetch_row(char *buffer, void *row) +{ + int i; + + for (i = 0; i < META_BYTES_PER_ROW; i++) + buffer[i] = hw_get_mem_tag(row + i * KASAN_GRANULE_SIZE); +} + +void print_tags(u8 addr_tag, const void *addr) +{ + u8 memory_tag = hw_get_mem_tag((void *)addr); + + pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", + addr_tag, memory_tag); +} diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index add2dfe6169c..aebc44a29e83 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This file contains tag-based KASAN specific error reporting code. + * This file contains software tag-based KASAN specific error reporting code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index ba84e5106585..ac6a5f57df33 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -107,7 +107,7 @@ void unpoison_range(const void *address, size_t size) if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) *shadow = tag; - else + else /* CONFIG_KASAN_GENERIC */ *shadow = size & KASAN_GRANULE_MASK; } } diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 7317d5229b2b..a518483f3965 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This file contains core tag-based KASAN code. + * This file contains core software tag-based KASAN code. * * Copyright (c) 2018 Google, Inc. * Author: Andrey Konovalov -- cgit v1.2.3 From d56a9ef84bd0e1e8fba7a837ab12a4ec8476579f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:02:42 -0800 Subject: kasan, arm64: unpoison stack only with CONFIG_KASAN_STACK There's a config option CONFIG_KASAN_STACK that has to be enabled for KASAN to use stack instrumentation and perform validity checks for stack variables. There's no need to unpoison the stack when CONFIG_KASAN_STACK is not enabled. Only call kasan_unpoison_task_stack[_below]() when CONFIG_KASAN_STACK is enabled. Note that CONFIG_KASAN_STACK is an option that is currently always defined when CONFIG_KASAN is enabled, and therefore has to be tested with #if instead of #ifdef.
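Concretely, #ifdef only tests whether the macro is defined, while #if tests its value; a minimal sketch mirroring the include/linux/kasan.h hunk below:

/*
 * CONFIG_KASAN_STACK is defined (to 0 or 1) whenever CONFIG_KASAN is
 * enabled, so '#ifdef CONFIG_KASAN_STACK' would always be true under
 * KASAN; testing the value selects the empty stub when stack
 * instrumentation is disabled.
 */
#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
void kasan_unpoison_task_stack(struct task_struct *task);
#else
static inline void kasan_unpoison_task_stack(struct task_struct *task) {}
#endif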
Link: https://lkml.kernel.org/r/d09dd3f8abb388da397fd11598c5edeaa83fe559.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/If8a891e9fe01ea543e00b576852685afec0887e3 Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Acked-by: Catalin Marinas Reviewed-by: Dmitry Vyukov Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/sleep.S | 2 +- arch/x86/kernel/acpi/wakeup_64.S | 2 +- include/linux/kasan.h | 10 ++++++---- mm/kasan/common.c | 2 ++ 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 4be7f7eed875..6bdef7362c0e 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -133,7 +133,7 @@ SYM_FUNC_START(_cpu_resume) */ bl cpu_do_resume -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK mov x0, sp bl kasan_unpoison_task_stack_below #endif diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index c8daa92f38dc..5d3a0b8fd379 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -112,7 +112,7 @@ SYM_FUNC_START(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK /* * The suspend path may have poisoned some areas deeper in the stack, * which we now need to unpoison. diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d22ec4c9c1bd..e638255ce906 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -77,8 +77,6 @@ static inline void kasan_disable_current(void) {} void kasan_unpoison_range(const void *address, size_t size); -void kasan_unpoison_task_stack(struct task_struct *task); - void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); @@ -123,8 +121,6 @@ void kasan_restore_multi_shot(bool enabled); static inline void kasan_unpoison_range(const void *address, size_t size) {} -static inline void kasan_unpoison_task_stack(struct task_struct *task) {} - static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} static inline void kasan_free_pages(struct page *page, unsigned int order) {} @@ -176,6 +172,12 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } #endif /* CONFIG_KASAN */ +#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK +void kasan_unpoison_task_stack(struct task_struct *task); +#else +static inline void kasan_unpoison_task_stack(struct task_struct *task) {} +#endif + #ifdef CONFIG_KASAN_GENERIC void kasan_cache_shrink(struct kmem_cache *cache); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6c38fd0a9e5c..2754ce0c8334 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -63,6 +63,7 @@ void kasan_unpoison_range(const void *address, size_t size) unpoison_range(address, size); } +#if CONFIG_KASAN_STACK static void __kasan_unpoison_stack(struct task_struct *task, const void *sp) { void *base = task_stack_page(task); @@ -89,6 +90,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) unpoison_range(base, watermark - base); } +#endif /* CONFIG_KASAN_STACK */ void kasan_alloc_pages(struct page *page, unsigned int order) { -- cgit v1.2.3 From c0054c565ae598073d6c27762c7d4f7de49a45d9 Mon Sep 17 00:00:00 
2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:02:52 -0800 Subject: kasan: inline kasan_reset_tag for tag-based modes Using kasan_reset_tag() currently results in a function call. As it's called quite often from the allocator code, this leads to a noticeable slowdown. Move it to include/linux/kasan.h and turn it into a static inline function. Also remove the now unneeded reset_tag() internal KASAN macro and use kasan_reset_tag() instead. Link: https://lkml.kernel.org/r/6940383a3a9dfb416134d338d8fac97a9ebb8686.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/I4d2061acfe91d480a75df00b07c22d8494ef14b5 Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Dmitry Vyukov Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 5 ++++- mm/kasan/common.c | 6 +++--- mm/kasan/hw_tags.c | 9 ++------- mm/kasan/kasan.h | 4 ---- mm/kasan/report.c | 4 ++-- mm/kasan/report_hw_tags.c | 2 +- mm/kasan/report_sw_tags.c | 4 ++-- mm/kasan/shadow.c | 4 ++-- mm/kasan/sw_tags.c | 9 ++------- 9 files changed, 18 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index e638255ce906..3bb72de94f90 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -194,7 +194,10 @@ static inline void kasan_record_aux_stack(void *ptr) {} #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) -void *kasan_reset_tag(const void *addr); +static inline void *kasan_reset_tag(const void *addr) +{ + return (void *)arch_kasan_reset_tag(addr); +} bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b71dfe7c5059..780ec27459ab 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -179,14 +179,14 @@ size_t kasan_metadata_size(struct kmem_cache *cache) struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object) { - return (void *)reset_tag(object) + cache->kasan_info.alloc_meta_offset; + return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset; } struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, const void *object) { BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - return (void *)reset_tag(object) + cache->kasan_info.free_meta_offset; + return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset; } void kasan_poison_slab(struct page *page) @@ -283,7 +283,7 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, tag = get_tag(object); tagged_object = object; - object = reset_tag(object); + object = kasan_reset_tag(object); if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != object)) { diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index cb849c8da978..227599e54e8e 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -30,20 +30,15 @@ void __init kasan_init_hw_tags(void) pr_info("KernelAddressSanitizer initialized\n"); } -void *kasan_reset_tag(const void *addr) -{ - return reset_tag(addr); -} - void poison_range(const void *address, size_t size, u8 value) { - hw_set_mem_tag_range(reset_tag(address), + hw_set_mem_tag_range(kasan_reset_tag(address), round_up(size, KASAN_GRANULE_SIZE), value); } void unpoison_range(const void *address, size_t size) { - 
hw_set_mem_tag_range(reset_tag(address), + hw_set_mem_tag_range(kasan_reset_tag(address), round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 0eab7e4cecb8..5e8cd2080369 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -248,15 +248,11 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) return addr; } #endif -#ifndef arch_kasan_reset_tag -#define arch_kasan_reset_tag(addr) ((void *)(addr)) -#endif #ifndef arch_kasan_get_tag #define arch_kasan_get_tag(addr) 0 #endif #define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag))) -#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr)) #define get_tag(addr) arch_kasan_get_tag(addr) #ifdef CONFIG_KASAN_HW_TAGS diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 85ce2cb2cd2b..00c590efdaea 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -328,7 +328,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) unsigned long flags; u8 tag = get_tag(object); - object = reset_tag(object); + object = kasan_reset_tag(object); #if IS_ENABLED(CONFIG_KUNIT) if (current->kunit_test) @@ -361,7 +361,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, disable_trace_on_warning(); tagged_addr = (void *)addr; - untagged_addr = reset_tag(tagged_addr); + untagged_addr = kasan_reset_tag(tagged_addr); info.access_addr = tagged_addr; if (addr_has_metadata(untagged_addr)) diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index da543eb832cd..57114f0e14d1 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -22,7 +22,7 @@ const char *get_bug_type(struct kasan_access_info *info) void *find_first_bad_addr(void *addr, size_t size) { - return reset_tag(addr); + return kasan_reset_tag(addr); } void metadata_fetch_row(char *buffer, void *row) diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 317100fd95b9..7604b46239d4 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -41,7 +41,7 @@ const char *get_bug_type(struct kasan_access_info *info) int i; tag = get_tag(info->access_addr); - addr = reset_tag(info->access_addr); + addr = kasan_reset_tag(info->access_addr); page = kasan_addr_to_page(addr); if (page && PageSlab(page)) { cache = page->slab_cache; @@ -72,7 +72,7 @@ const char *get_bug_type(struct kasan_access_info *info) void *find_first_bad_addr(void *addr, size_t size) { u8 tag = get_tag(addr); - void *p = reset_tag(addr); + void *p = kasan_reset_tag(addr); void *end = p + size; while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p)) diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index ac6a5f57df33..44a2b748f9d3 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -81,7 +81,7 @@ void poison_range(const void *address, size_t size, u8 value) * some of the callers (e.g. kasan_poison_object_data) pass tagged * addresses to this function. */ - address = reset_tag(address); + address = kasan_reset_tag(address); shadow_start = kasan_mem_to_shadow(address); shadow_end = kasan_mem_to_shadow(address + size); @@ -98,7 +98,7 @@ void unpoison_range(const void *address, size_t size) * some of the callers (e.g. kasan_unpoison_object_data) pass tagged * addresses to this function. 
*/ - address = reset_tag(address); + address = kasan_reset_tag(address); poison_range(address, size, tag); diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 6d7648cc3b98..e17de2619bbf 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -67,11 +67,6 @@ u8 random_tag(void) return (u8)(state % (KASAN_TAG_MAX + 1)); } -void *kasan_reset_tag(const void *addr) -{ - return reset_tag(addr); -} - bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { @@ -107,7 +102,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, if (tag == KASAN_TAG_KERNEL) return true; - untagged_addr = reset_tag((const void *)addr); + untagged_addr = kasan_reset_tag((const void *)addr); if (unlikely(untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { return !kasan_report(addr, size, write, ret_ip); @@ -126,7 +121,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, bool check_invalid_free(void *addr) { u8 tag = get_tag(addr); - u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(reset_tag(addr))); + u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr))); return (shadow_byte == KASAN_TAG_INVALID) || (tag != KASAN_TAG_KERNEL && tag != shadow_byte); -- cgit v1.2.3 From bffe690708c8b4fdb8f0bff8ff22b347fc6c709a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:02:59 -0800 Subject: kasan: open-code kasan_unpoison_slab There's the external annotation kasan_unpoison_slab() that is currently defined as static inline and uses kasan_unpoison_range(). Open-code this function in mempool.c. Otherwise with an upcoming change this function will result in an unnecessary function call. Link: https://lkml.kernel.org/r/131a6694a978a9a8b150187e539eecc8bcbf759b.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/Ia7c8b659f79209935cbaab3913bf7f082cc43a0e Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 ------ mm/mempool.c | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 3bb72de94f90..7350de3e9fe4 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -107,11 +107,6 @@ struct kasan_cache { int free_meta_offset; }; -size_t __ksize(const void *); -static inline void kasan_unpoison_slab(const void *ptr) -{ - kasan_unpoison_range(ptr, __ksize(ptr)); -} size_t kasan_metadata_size(struct kmem_cache *cache); bool kasan_save_enable_multi_shot(void); @@ -167,7 +162,6 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, return false; } -static inline void kasan_unpoison_slab(const void *ptr) { } static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } #endif /* CONFIG_KASAN */ diff --git a/mm/mempool.c b/mm/mempool.c index f473cdddaff0..583a9865b181 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -112,7 +112,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_unpoison_slab(element); + kasan_unpoison_range(element, __ksize(element)); 
else if (pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } -- cgit v1.2.3 From 34303244f2615add92076a4bf2d4f39323bde4f2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:03:10 -0800 Subject: kasan, mm: check kasan_enabled in annotations Declare the kasan_enabled static key in include/linux/kasan.h and in include/linux/mm.h and check it in all kasan annotations. This allows to avoid any slowdown caused by function calls when kasan_enabled is disabled. Link: https://lkml.kernel.org/r/9f90e3c0aa840dbb4833367c2335193299f69023.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/I2589451d3c96c97abbcbf714baabe6161c6f153e Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Dmitry Vyukov Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 213 +++++++++++++++++++++++++++++++++++++++----------- include/linux/mm.h | 22 ++++-- mm/kasan/common.c | 56 ++++++------- 3 files changed, 210 insertions(+), 81 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 7350de3e9fe4..9176849c4934 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -2,6 +2,7 @@ #ifndef _LINUX_KASAN_H #define _LINUX_KASAN_H +#include #include struct kmem_cache; @@ -75,54 +76,176 @@ static inline void kasan_disable_current(void) {} #ifdef CONFIG_KASAN -void kasan_unpoison_range(const void *address, size_t size); +struct kasan_cache { + int alloc_meta_offset; + int free_meta_offset; +}; -void kasan_alloc_pages(struct page *page, unsigned int order); -void kasan_free_pages(struct page *page, unsigned int order); +#ifdef CONFIG_KASAN_HW_TAGS +DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled); +static __always_inline bool kasan_enabled(void) +{ + return static_branch_likely(&kasan_flag_enabled); +} +#else +static inline bool kasan_enabled(void) +{ + return true; +} +#endif -void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags); +void __kasan_unpoison_range(const void *addr, size_t size); +static __always_inline void kasan_unpoison_range(const void *addr, size_t size) +{ + if (kasan_enabled()) + __kasan_unpoison_range(addr, size); +} -void kasan_poison_slab(struct page *page); -void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); -void kasan_poison_object_data(struct kmem_cache *cache, void *object); -void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, - const void *object); +void __kasan_alloc_pages(struct page *page, unsigned int order); +static __always_inline void kasan_alloc_pages(struct page *page, + unsigned int order) +{ + if (kasan_enabled()) + __kasan_alloc_pages(page, order); +} -void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, - gfp_t flags); -void kasan_kfree_large(void *ptr, unsigned long ip); -void kasan_poison_kfree(void *ptr, unsigned long ip); -void * __must_check kasan_kmalloc(struct kmem_cache *s, const void *object, - size_t size, gfp_t flags); -void * __must_check kasan_krealloc(const void *object, size_t new_size, - gfp_t flags); +void __kasan_free_pages(struct page *page, unsigned int order); +static __always_inline void kasan_free_pages(struct page *page, + 
unsigned int order) +{ + if (kasan_enabled()) + __kasan_free_pages(page, order); +} -void * __must_check kasan_slab_alloc(struct kmem_cache *s, void *object, - gfp_t flags); -bool kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip); +void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags); +static __always_inline void kasan_cache_create(struct kmem_cache *cache, + unsigned int *size, slab_flags_t *flags) +{ + if (kasan_enabled()) + __kasan_cache_create(cache, size, flags); +} -struct kasan_cache { - int alloc_meta_offset; - int free_meta_offset; -}; +size_t __kasan_metadata_size(struct kmem_cache *cache); +static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache) +{ + if (kasan_enabled()) + return __kasan_metadata_size(cache); + return 0; +} + +void __kasan_poison_slab(struct page *page); +static __always_inline void kasan_poison_slab(struct page *page) +{ + if (kasan_enabled()) + __kasan_poison_slab(page); +} + +void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object); +static __always_inline void kasan_unpoison_object_data(struct kmem_cache *cache, + void *object) +{ + if (kasan_enabled()) + __kasan_unpoison_object_data(cache, object); +} + +void __kasan_poison_object_data(struct kmem_cache *cache, void *object); +static __always_inline void kasan_poison_object_data(struct kmem_cache *cache, + void *object) +{ + if (kasan_enabled()) + __kasan_poison_object_data(cache, object); +} + +void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, + const void *object); +static __always_inline void * __must_check kasan_init_slab_obj( + struct kmem_cache *cache, const void *object) +{ + if (kasan_enabled()) + return __kasan_init_slab_obj(cache, object); + return (void *)object; +} + +bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip); +static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object, + unsigned long ip) +{ + if (kasan_enabled()) + return __kasan_slab_free(s, object, ip); + return false; +} + +void * __must_check __kasan_slab_alloc(struct kmem_cache *s, + void *object, gfp_t flags); +static __always_inline void * __must_check kasan_slab_alloc( + struct kmem_cache *s, void *object, gfp_t flags) +{ + if (kasan_enabled()) + return __kasan_slab_alloc(s, object, flags); + return object; +} + +void * __must_check __kasan_kmalloc(struct kmem_cache *s, const void *object, + size_t size, gfp_t flags); +static __always_inline void * __must_check kasan_kmalloc(struct kmem_cache *s, + const void *object, size_t size, gfp_t flags) +{ + if (kasan_enabled()) + return __kasan_kmalloc(s, object, size, flags); + return (void *)object; +} -size_t kasan_metadata_size(struct kmem_cache *cache); +void * __must_check __kasan_kmalloc_large(const void *ptr, + size_t size, gfp_t flags); +static __always_inline void * __must_check kasan_kmalloc_large(const void *ptr, + size_t size, gfp_t flags) +{ + if (kasan_enabled()) + return __kasan_kmalloc_large(ptr, size, flags); + return (void *)ptr; +} + +void * __must_check __kasan_krealloc(const void *object, + size_t new_size, gfp_t flags); +static __always_inline void * __must_check kasan_krealloc(const void *object, + size_t new_size, gfp_t flags) +{ + if (kasan_enabled()) + return __kasan_krealloc(object, new_size, flags); + return (void *)object; +} + +void __kasan_poison_kfree(void *ptr, unsigned long ip); +static __always_inline void kasan_poison_kfree(void *ptr, unsigned long ip) +{ + if (kasan_enabled()) + 
__kasan_poison_kfree(ptr, ip); +} + +void __kasan_kfree_large(void *ptr, unsigned long ip); +static __always_inline void kasan_kfree_large(void *ptr, unsigned long ip) +{ + if (kasan_enabled()) + __kasan_kfree_large(ptr, ip); +} bool kasan_save_enable_multi_shot(void); void kasan_restore_multi_shot(bool enabled); #else /* CONFIG_KASAN */ +static inline bool kasan_enabled(void) +{ + return false; +} static inline void kasan_unpoison_range(const void *address, size_t size) {} - static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} static inline void kasan_free_pages(struct page *page, unsigned int order) {} - static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} - +static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} @@ -133,36 +256,32 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache, { return (void *)object; } - -static inline void *kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) +static inline bool kasan_slab_free(struct kmem_cache *s, void *object, + unsigned long ip) { - return ptr; + return false; +} +static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, + gfp_t flags) +{ + return object; } -static inline void kasan_kfree_large(void *ptr, unsigned long ip) {} -static inline void kasan_poison_kfree(void *ptr, unsigned long ip) {} static inline void *kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, gfp_t flags) { return (void *)object; } +static inline void *kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) +{ + return (void *)ptr; +} static inline void *kasan_krealloc(const void *object, size_t new_size, gfp_t flags) { return (void *)object; } - -static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, - gfp_t flags) -{ - return object; -} -static inline bool kasan_slab_free(struct kmem_cache *s, void *object, - unsigned long ip) -{ - return false; -} - -static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } +static inline void kasan_poison_kfree(void *ptr, unsigned long ip) {} +static inline void kasan_kfree_large(void *ptr, unsigned long ip) {} #endif /* CONFIG_KASAN */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 024ec0a00c72..5299b90a6c40 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -31,6 +31,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1422,22 +1423,30 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + static inline u8 page_kasan_tag(const struct page *page) { - return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + if (kasan_enabled()) + return (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + return 0xff; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { - page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); - page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + if (kasan_enabled()) { + page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); + page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; + } } static inline void page_kasan_tag_reset(struct page *page) { - page_kasan_tag_set(page, 0xff); + if (kasan_enabled()) + page_kasan_tag_set(page, 0xff); } -#else + +#else /* 
CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + static inline u8 page_kasan_tag(const struct page *page) { return 0xff; @@ -1445,7 +1454,8 @@ static inline u8 page_kasan_tag(const struct page *page) static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } -#endif + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 219c2979bd3e..ae0130cf9de3 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -58,7 +58,7 @@ void kasan_disable_current(void) } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ -void kasan_unpoison_range(const void *address, size_t size) +void __kasan_unpoison_range(const void *address, size_t size) { unpoison_range(address, size); } @@ -86,7 +86,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -void kasan_alloc_pages(struct page *page, unsigned int order) +void __kasan_alloc_pages(struct page *page, unsigned int order) { u8 tag; unsigned long i; @@ -100,7 +100,7 @@ void kasan_alloc_pages(struct page *page, unsigned int order) unpoison_range(page_address(page), PAGE_SIZE << order); } -void kasan_free_pages(struct page *page, unsigned int order) +void __kasan_free_pages(struct page *page, unsigned int order) { if (likely(!PageHighMem(page))) poison_range(page_address(page), @@ -127,8 +127,8 @@ static inline unsigned int optimal_redzone(unsigned int object_size) object_size <= (1 << 16) - 1024 ? 1024 : 2048; } -void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags) +void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags) { unsigned int orig_size = *size; unsigned int redzone_size; @@ -173,7 +173,7 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, *flags |= SLAB_KASAN; } -size_t kasan_metadata_size(struct kmem_cache *cache) +size_t __kasan_metadata_size(struct kmem_cache *cache) { if (!kasan_stack_collection_enabled()) return 0; @@ -196,7 +196,7 @@ struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset; } -void kasan_poison_slab(struct page *page) +void __kasan_poison_slab(struct page *page) { unsigned long i; @@ -206,12 +206,12 @@ void kasan_poison_slab(struct page *page) KASAN_KMALLOC_REDZONE); } -void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) +void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { unpoison_range(object, cache->object_size); } -void kasan_poison_object_data(struct kmem_cache *cache, void *object) +void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { poison_range(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), @@ -264,7 +264,7 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object, #endif } -void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, +void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { struct kasan_alloc_meta *alloc_meta; @@ -283,7 +283,7 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, return (void *)object; } -static bool __kasan_slab_free(struct kmem_cache *cache, void *object, +static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip, bool quarantine) { u8 tag; @@ -326,9 +326,9 @@ static bool 
__kasan_slab_free(struct kmem_cache *cache, void *object, return IS_ENABLED(CONFIG_KASAN_GENERIC); } -bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) +bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) { - return __kasan_slab_free(cache, object, ip, true); + return ____kasan_slab_free(cache, object, ip, true); } static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) @@ -336,7 +336,7 @@ static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) kasan_set_track(&kasan_get_alloc_meta(cache, object)->alloc_track, flags); } -static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, +static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, gfp_t flags, bool keep_tag) { unsigned long redzone_start; @@ -368,20 +368,20 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, return set_tag(object, tag); } -void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object, - gfp_t flags) +void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, + void *object, gfp_t flags) { - return __kasan_kmalloc(cache, object, cache->object_size, flags, false); + return ____kasan_kmalloc(cache, object, cache->object_size, flags, false); } -void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags) +void * __must_check __kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags) { - return __kasan_kmalloc(cache, object, size, flags, true); + return ____kasan_kmalloc(cache, object, size, flags, true); } -EXPORT_SYMBOL(kasan_kmalloc); +EXPORT_SYMBOL(__kasan_kmalloc); -void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, +void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { struct page *page; @@ -406,7 +406,7 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, return (void *)ptr; } -void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) +void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags) { struct page *page; @@ -416,13 +416,13 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) page = virt_to_head_page(object); if (unlikely(!PageSlab(page))) - return kasan_kmalloc_large(object, size, flags); + return __kasan_kmalloc_large(object, size, flags); else - return __kasan_kmalloc(page->slab_cache, object, size, + return ____kasan_kmalloc(page->slab_cache, object, size, flags, true); } -void kasan_poison_kfree(void *ptr, unsigned long ip) +void __kasan_poison_kfree(void *ptr, unsigned long ip) { struct page *page; @@ -435,11 +435,11 @@ void kasan_poison_kfree(void *ptr, unsigned long ip) } poison_range(ptr, page_size(page), KASAN_FREE_PAGE); } else { - __kasan_slab_free(page->slab_cache, ptr, ip, false); + ____kasan_slab_free(page->slab_cache, ptr, ip, false); } } -void kasan_kfree_large(void *ptr, unsigned long ip) +void __kasan_kfree_large(void *ptr, unsigned long ip) { if (ptr != page_address(virt_to_head_page(ptr))) kasan_report_invalid_free(ptr, ip); -- cgit v1.2.3 From eeb3160c2419e0f1045537acac7b19cba64112f4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:03:13 -0800 Subject: kasan, mm: rename kasan_poison_kfree Rename kasan_poison_kfree() to kasan_slab_free_mempool() as it better reflects what this annotation does. 
Also add a comment that explains the PageSlab() check. No functional changes. Link: https://lkml.kernel.org/r/141675fb493555e984c5dca555e9d9f768c7bbaa.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/I5026f87364e556b506ef1baee725144bb04b8810 Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 16 ++++++++-------- mm/kasan/common.c | 40 +++++++++++++++++++++++----------------- mm/mempool.c | 2 +- 3 files changed, 32 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9176849c4934..6f0c5d9aa43f 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -176,6 +176,13 @@ static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object, return false; } +void __kasan_slab_free_mempool(void *ptr, unsigned long ip); +static __always_inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) +{ + if (kasan_enabled()) + __kasan_slab_free_mempool(ptr, ip); +} + void * __must_check __kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); static __always_inline void * __must_check kasan_slab_alloc( @@ -216,13 +223,6 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, return (void *)object; } -void __kasan_poison_kfree(void *ptr, unsigned long ip); -static __always_inline void kasan_poison_kfree(void *ptr, unsigned long ip) -{ - if (kasan_enabled()) - __kasan_poison_kfree(ptr, ip); -} - void __kasan_kfree_large(void *ptr, unsigned long ip); static __always_inline void kasan_kfree_large(void *ptr, unsigned long ip) { @@ -261,6 +261,7 @@ static inline bool kasan_slab_free(struct kmem_cache *s, void *object, { return false; } +static inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) { @@ -280,7 +281,6 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } -static inline void kasan_poison_kfree(void *ptr, unsigned long ip) {} static inline void kasan_kfree_large(void *ptr, unsigned long ip) {} #endif /* CONFIG_KASAN */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index ae0130cf9de3..d0f8d7a955cd 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -331,6 +331,29 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) return ____kasan_slab_free(cache, object, ip, true); } +void __kasan_slab_free_mempool(void *ptr, unsigned long ip) +{ + struct page *page; + + page = virt_to_head_page(ptr); + + /* + * Even though this function is only called for kmem_cache_alloc and + * kmalloc backed mempool allocations, those allocations can still be + * !PageSlab() when the size provided to kmalloc is larger than + * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc. 
+ */ + if (unlikely(!PageSlab(page))) { + if (ptr != page_address(page)) { + kasan_report_invalid_free(ptr, ip); + return; + } + poison_range(ptr, page_size(page), KASAN_FREE_PAGE); + } else { + ____kasan_slab_free(page->slab_cache, ptr, ip, false); + } +} + static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { kasan_set_track(&kasan_get_alloc_meta(cache, object)->alloc_track, flags); @@ -422,23 +445,6 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag flags, true); } -void __kasan_poison_kfree(void *ptr, unsigned long ip) -{ - struct page *page; - - page = virt_to_head_page(ptr); - - if (unlikely(!PageSlab(page))) { - if (ptr != page_address(page)) { - kasan_report_invalid_free(ptr, ip); - return; - } - poison_range(ptr, page_size(page), KASAN_FREE_PAGE); - } else { - ____kasan_slab_free(page->slab_cache, ptr, ip, false); - } -} - void __kasan_kfree_large(void *ptr, unsigned long ip) { if (ptr != page_address(virt_to_head_page(ptr))) kasan_report_invalid_free(ptr, ip); diff --git a/mm/mempool.c b/mm/mempool.c index 583a9865b181..624ed51b060f 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element) static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_poison_kfree(element, _RET_IP_); + kasan_slab_free_mempool(element, _RET_IP_); else if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } -- cgit v1.2.3 From e86f8b09f215e3755cd2d56930487dec2de02433 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:03:31 -0800 Subject: kasan, mm: allow cache merging with no metadata The reason cache merging is disabled with KASAN is that KASAN puts its metadata right after the allocated object. When the merged caches have slightly different sizes, the metadata ends up in different places, which KASAN doesn't support. It might be possible to adjust the metadata allocation algorithm and make it friendly to the cache merging code. Instead this change takes a simpler approach and allows merging caches when no metadata is present, which is the case for hardware tag-based KASAN with kasan.mode=prod.
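In outline, the change turns KASAN's contribution to the slab "never merge" mask from a fixed flag into a runtime query. A condensed sketch of the pattern, using only identifiers that appear in the diff below (not the complete kernel sources):

        /* KASAN side: report a merge-blocking flag only when per-object
         * metadata (and thus stack collection) is actually in use. */
        slab_flags_t __kasan_never_merge(void)
        {
                if (kasan_stack_collection_enabled())
                        return SLAB_KASAN;      /* metadata present: don't merge */
                return 0;                       /* no metadata: merging is safe */
        }

        /* slab side: the mask is now computed, not constant. */
        #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
                                  SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
                                  SLAB_FAILSLAB | kasan_never_merge())

With kasan.mode=prod on hardware tag-based KASAN, kasan_never_merge() returns 0 and otherwise-identical caches can be merged again.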
Link: https://lkml.kernel.org/r/37497e940bfd4b32c0a93a702a9ae4cf061d5392.1606162397.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/Ia114847dfb2244f297d2cb82d592bf6a07455dba Co-developed-by: Vincenzo Frascino Signed-off-by: Vincenzo Frascino Signed-off-by: Andrey Konovalov Reviewed-by: Dmitry Vyukov Reviewed-by: Marco Elver Tested-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 21 +++++++++++++++++++-- mm/kasan/common.c | 11 +++++++++++ mm/slab_common.c | 3 ++- 3 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 6f0c5d9aa43f..5e0655fb2a6f 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -82,17 +82,30 @@ struct kasan_cache { }; #ifdef CONFIG_KASAN_HW_TAGS + DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled); + static __always_inline bool kasan_enabled(void) { return static_branch_likely(&kasan_flag_enabled); } -#else + +#else /* CONFIG_KASAN_HW_TAGS */ + static inline bool kasan_enabled(void) { return true; } -#endif + +#endif /* CONFIG_KASAN_HW_TAGS */ + +slab_flags_t __kasan_never_merge(void); +static __always_inline slab_flags_t kasan_never_merge(void) +{ + if (kasan_enabled()) + return __kasan_never_merge(); + return 0; +} void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) @@ -239,6 +252,10 @@ static inline bool kasan_enabled(void) { return false; } +static inline slab_flags_t kasan_never_merge(void) +{ + return 0; +} static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_alloc_pages(struct page *page, unsigned int order) {} static inline void kasan_free_pages(struct page *page, unsigned int order) {} diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 0cd583d2fe1c..b25167664ead 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -86,6 +86,17 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ +/* + * Only allow cache merging when stack collection is disabled and no metadata + * is present. 
+ */ +slab_flags_t __kasan_never_merge(void) +{ + if (kasan_stack_collection_enabled()) + return SLAB_KASAN; + return 0; +} + void __kasan_alloc_pages(struct page *page, unsigned int order) { u8 tag; diff --git a/mm/slab_common.c b/mm/slab_common.c index 573fbacd9ef5..e981c80d216c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -53,7 +54,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB | SLAB_KASAN) + SLAB_FAILSLAB | kasan_never_merge()) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_ACCOUNT) -- cgit v1.2.3 From 2ca408d9c749c32288bc28725f9f12ba30299e8f Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 30 Nov 2020 17:30:59 -0500 Subject: fanotify: Fix sys_fanotify_mark() on native x86-32 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 121b32a58a3a ("x86/entry/32: Use IA32-specific wrappers for syscalls taking 64-bit arguments") converted native x86-32 syscalls which take 64-bit arguments to use the compat handlers to allow conversion to passing args via pt_regs. sys_fanotify_mark() was however missed, as it has a general compat handler. Add a config option that will use the syscall wrapper that takes the split args for native 32-bit. [ bp: Fix typo in Kconfig help text. ] Fixes: 121b32a58a3a ("x86/entry/32: Use IA32-specific wrappers for syscalls taking 64-bit arguments") Reported-by: Paweł Jasiak Signed-off-by: Brian Gerst Signed-off-by: Borislav Petkov Acked-by: Jan Kara Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20201130223059.101286-1-brgerst@gmail.com --- arch/Kconfig | 6 ++++++ arch/x86/Kconfig | 1 + fs/notify/fanotify/fanotify_user.c | 17 +++++++---------- include/linux/syscalls.h | 24 ++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 78c6f05b10f9..24862d15f3a3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1105,6 +1105,12 @@ config HAVE_ARCH_PFN_VALID config ARCH_SUPPORTS_DEBUG_PAGEALLOC bool +config ARCH_SPLIT_ARG64 + bool + help + If a 32-bit architecture requires 64-bit arguments to be split into + pairs of 32-bit arguments, select this option.
+ source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7b6dd10b162a..21f851179ff0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,7 @@ config X86_32 select KMAP_LOCAL select MODULES_USE_ELF_REL select OLD_SIGACTION + select ARCH_SPLIT_ARG64 config X86_64 def_bool y diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 3e01d8f2ab90..dcab112e1f00 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1285,26 +1285,23 @@ fput_and_out: return ret; } +#ifndef CONFIG_ARCH_SPLIT_ARG64 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, __u64, mask, int, dfd, const char __user *, pathname) { return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname); } +#endif -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE6(fanotify_mark, +#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT) +SYSCALL32_DEFINE6(fanotify_mark, int, fanotify_fd, unsigned int, flags, - __u32, mask0, __u32, mask1, int, dfd, + SC_ARG64(mask), int, dfd, const char __user *, pathname) { - return do_fanotify_mark(fanotify_fd, flags, -#ifdef __BIG_ENDIAN - ((__u64)mask0 << 32) | mask1, -#else - ((__u64)mask1 << 32) | mask0, -#endif - dfd, pathname); + return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask), + dfd, pathname); } #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f3929aff39cf..7688bc983de5 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -251,6 +251,30 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* __SYSCALL_DEFINEx */ +/* For split 64-bit arguments on 32-bit architectures */ +#ifdef __LITTLE_ENDIAN +#define SC_ARG64(name) u32, name##_lo, u32, name##_hi +#else +#define SC_ARG64(name) u32, name##_hi, u32, name##_lo +#endif +#define SC_VAL64(type, name) ((type) name##_hi << 32 | name##_lo) + +#ifdef CONFIG_COMPAT +#define SYSCALL32_DEFINE1 COMPAT_SYSCALL_DEFINE1 +#define SYSCALL32_DEFINE2 COMPAT_SYSCALL_DEFINE2 +#define SYSCALL32_DEFINE3 COMPAT_SYSCALL_DEFINE3 +#define SYSCALL32_DEFINE4 COMPAT_SYSCALL_DEFINE4 +#define SYSCALL32_DEFINE5 COMPAT_SYSCALL_DEFINE5 +#define SYSCALL32_DEFINE6 COMPAT_SYSCALL_DEFINE6 +#else +#define SYSCALL32_DEFINE1 SYSCALL_DEFINE1 +#define SYSCALL32_DEFINE2 SYSCALL_DEFINE2 +#define SYSCALL32_DEFINE3 SYSCALL_DEFINE3 +#define SYSCALL32_DEFINE4 SYSCALL_DEFINE4 +#define SYSCALL32_DEFINE5 SYSCALL_DEFINE5 +#define SYSCALL32_DEFINE6 SYSCALL_DEFINE6 +#endif + /* * Called before coming back to user-mode. Returning to user-mode with an * address limit different than USER_DS can allow to overwrite kernel memory. -- cgit v1.2.3 From 664f1e259a982bf213f0cd8eea7616c89546585c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 16 Dec 2020 18:40:05 +0100 Subject: libceph: add __maybe_unused to DEFINE_MSGR2_FEATURE Avoid -Wunused-const-variable warnings for "make W=1". 
Reported-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/msgr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index f5e02f6c0655..3989dcb94d3d 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -33,8 +33,8 @@ #define CEPH_MSGR2_INCARNATION_1 (0ull) #define DEFINE_MSGR2_FEATURE(bit, incarnation, name) \ - static const uint64_t CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \ - static const uint64_t CEPH_MSGR2_FEATUREMASK_##name = \ + static const uint64_t __maybe_unused CEPH_MSGR2_FEATURE_##name = (1ULL << bit); \ + static const uint64_t __maybe_unused CEPH_MSGR2_FEATUREMASK_##name = \ (1ULL << bit | CEPH_MSGR2_INCARNATION_##incarnation); #define HAVE_MSGR2_FEATURE(x, name) \ -- cgit v1.2.3 From 3a176b94609a18f5f8bac7ddbf8923bd737262db Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 29 Dec 2020 15:14:28 -0800 Subject: Revert "kbuild: avoid static_assert for genksyms" This reverts commit 14dc3983b5dff513a90bd5a8cc90acaf7867c3d0. Marco Elver had sent a proper fix earlier, and also pointed out corner cases: "I guess what you propose is simpler, but might still have corner cases where we still get warnings. In particular, if some file (for whatever reason) does not include build_bug.h and uses a raw _Static_assert(), then we still get warnings. E.g. I see 1 user of raw _Static_assert() (drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h )." I believe the raw use of _Static_assert() should be allowed, so this should be fixed in genksyms. Even after commit 14dc3983b5df ("kbuild: avoid static_assert for genksyms"), I confirmed the following test code emits the warning. ---------------->8---------------- #include _Static_assert((1 ?: 0), ""); void foo(void) { } EXPORT_SYMBOL(foo); ---------------->8---------------- WARNING: modpost: EXPORT symbol "foo" [vmlinux] version generation failed, symbol will not be versioned. Now that commit 869b91992bce ("genksyms: Ignore module scoped _Static_assert()") fixed this issue properly, the workaround should be reverted. Link: https://lkml.org/lkml/2020/12/10/845 Link: https://lkml.kernel.org/r/20201219183911.181442-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/build_bug.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/build_bug.h b/include/linux/build_bug.h index 7bb66e15b481..e3a0be2c90ad 100644 --- a/include/linux/build_bug.h +++ b/include/linux/build_bug.h @@ -77,9 +77,4 @@ #define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr) #define __static_assert(expr, msg, ...) _Static_assert(expr, msg) -#ifdef __GENKSYMS__ -/* genksyms gets confused by _Static_assert */ -#define _Static_assert(expr, ...) -#endif - #endif /* _LINUX_BUILD_BUG_H */ -- cgit v1.2.3 From 6d87d0ece58bc0022ca5247721a8eb06ef66b673 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Tue, 29 Dec 2020 15:14:34 -0800 Subject: mm: add prototype for __add_to_page_cache_locked() Otherwise it causes a gcc warning: mm/filemap.c:830:14: warning: no previous prototype for `__add_to_page_cache_locked' [-Wmissing-prototypes] A previous attempt to make this function static led to compilation errors when CONFIG_DEBUG_INFO_BTF is enabled because __add_to_page_cache_locked() is referred to by BPF code. Adding a prototype will silence the warning.
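A toy illustration of the -Wmissing-prototypes pattern (the function name is hypothetical, not the mm code): a function with external linkage but no prior declaration warns, and a visible prototype silences the warning without making the symbol static:

        /* build with: gcc -Wmissing-prototypes -c demo.c */
        int demo_helper(int x);         /* the prototype such a patch adds */

        int demo_helper(int x)          /* stays global on purpose, standing in */
        {                               /* for a symbol referenced from outside */
                return x + 1;           /* normal C callers (here, BPF)         */
        }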
Link: https://lkml.kernel.org/r/1608693702-4665-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Cc: Alex Shi Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5299b90a6c40..c1e908184442 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -216,6 +216,13 @@ int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +/* + * Any attempt to mark this function as static leads to build failure + * when CONFIG_DEBUG_INFO_BTF is enabled because __add_to_page_cache_locked() + * is referred to by BPF code. This must be visible for error injection. + */ +int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, + pgoff_t index, gfp_t gfp, void **shadowp); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) -- cgit v1.2.3 From dc2da7b45ffe954a0090f5d0310ed7b0b37d2bd2 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 29 Dec 2020 15:14:37 -0800 Subject: mm: memmap defer init doesn't work as expected VMware observed a performance regression during memmap init on their platform, and bisected to commit 73a6e474cb376 ("mm: memmap_init: iterate over memblock regions rather that check each PFN") causing it. Before the commit: [0.033176] Normal zone: 1445888 pages used for memmap [0.033176] Normal zone: 89391104 pages, LIFO batch:63 [0.035851] ACPI: PM-Timer IO Port: 0x448 With the commit: [0.026874] Normal zone: 1445888 pages used for memmap [0.026875] Normal zone: 89391104 pages, LIFO batch:63 [2.028450] ACPI: PM-Timer IO Port: 0x448 The root cause is that the current memmap defer init doesn't work as expected. Before, memmap_init_zone() was used to do memmap init of one whole zone, to initialize all low zones of one numa node, but defer memmap init of the last zone in that numa node. However, since commit 73a6e474cb376, function memmap_init() is adapted to iterate over memblock regions inside one zone, then call memmap_init_zone() to do memmap init for each region. E.g., on VMware's system, the memory layout is as below, there are two memory regions in node 2. The current code will mistakenly initialize the whole 1st region [mem 0xab00000000-0xfcffffffff], then apply memmap deferral to initialize only one memory section on the 2nd region [mem 0x10000000000-0x1033fffffff]. In fact, we expect to see only one memory section's memmap initialized. That's why more time is spent there. [ 0.008842] ACPI: SRAT: Node 0 PXM 0 [mem 0x00000000-0x0009ffff] [ 0.008842] ACPI: SRAT: Node 0 PXM 0 [mem 0x00100000-0xbfffffff] [ 0.008843] ACPI: SRAT: Node 0 PXM 0 [mem 0x100000000-0x55ffffffff] [ 0.008844] ACPI: SRAT: Node 1 PXM 1 [mem 0x5600000000-0xaaffffffff] [ 0.008844] ACPI: SRAT: Node 2 PXM 2 [mem 0xab00000000-0xfcffffffff] [ 0.008845] ACPI: SRAT: Node 2 PXM 2 [mem 0x10000000000-0x1033fffffff] Now, let's add a parameter 'zone_end_pfn' to memmap_init_zone() to pass down the real zone end pfn so that defer_init() can use it to judge whether deferral should be applied zone-wide.
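A condensed sketch of the fixed decision, simplified from the page_alloc.c hunks below: the deferral test must compare against the end of the whole zone, and once a first pfn has been deferred, everything after it is deferred as well.

        /* sketch only: the real defer_init() has additional checks */
        static bool defer_init(int nid, unsigned long pfn, unsigned long zone_end_pfn)
        {
                /* Only the last zone of the node may defer memmap init. */
                if (zone_end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
                        return false;
                /* A first deferred pfn was already recorded: keep deferring. */
                if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
                        return true;
                /* Otherwise decide where deferred init starts (elided). */
                return false;
        }

When memmap_init() calls this per memblock region but passes the region end instead of zone_end_pfn, the first of node 2's two regions above looks like a low zone and gets initialized in full, which is exactly the regression observed.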
Link: https://lkml.kernel.org/r/20201223080811.16211-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20201223080811.16211-2-bhe@redhat.com Fixes: commit 73a6e474cb376 ("mm: memmap_init: iterate over memblock regions rather that check each PFN") Signed-off-by: Baoquan He Reported-by: Rahul Gopakumar Reviewed-by: Mike Rapoport Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 4 ++-- include/linux/mm.h | 5 +++-- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 8 +++++--- 4 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 9b5acf8fb092..e76386a3479e 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -536,7 +536,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg) if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), - args->nid, args->zone, page_to_pfn(map_start), + args->nid, args->zone, page_to_pfn(map_start), page_to_pfn(map_end), MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); return 0; } @@ -546,7 +546,7 @@ memmap_init (unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { if (!vmem_map) { - memmap_init_zone(size, nid, zone, start_pfn, + memmap_init_zone(size, nid, zone, start_pfn, start_pfn + size, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } else { struct page *start; diff --git a/include/linux/mm.h b/include/linux/mm.h index c1e908184442..ecdf8a8cd6ae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2439,8 +2439,9 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum meminit_context, struct vmem_altmap *, int migratetype); +extern void memmap_init_zone(unsigned long, int, unsigned long, + unsigned long, unsigned long, enum meminit_context, + struct vmem_altmap *, int migratetype); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index af41fb990820..f9d57b9be8c7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -713,7 +713,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ - memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0, MEMINIT_HOTPLUG, altmap, migratetype); set_zone_contiguous(zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7a2c89b21115..bdbec4c98173 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -423,6 +423,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) return false; + if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) + return true; /* * We start only with one section of pages, more pages are added as * needed until the rest of deferred pages are initialized. @@ -6116,7 +6118,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * zone stats (e.g., nr_isolate_pageblock) are touched. 
*/ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, + unsigned long start_pfn, unsigned long zone_end_pfn, enum meminit_context context, struct vmem_altmap *altmap, int migratetype) { @@ -6152,7 +6154,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; - if (defer_init(nid, pfn, end_pfn)) + if (defer_init(nid, pfn, zone_end_pfn)) break; } @@ -6266,7 +6268,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid, if (end_pfn > start_pfn) { size = end_pfn - start_pfn; - memmap_init_zone(size, nid, zone, start_pfn, + memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } } -- cgit v1.2.3 From 8b0fac44bd1ff17016502b3c3533f5abb8456c65 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 29 Dec 2020 15:14:52 -0800 Subject: sizes.h: add SZ_8G/SZ_16G/SZ_32G macros Add these macros, since we can use them in drivers. Link: https://lkml.kernel.org/r/20201229072819.11183-1-sjhuang@iluvatar.ai Signed-off-by: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sizes.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sizes.h b/include/linux/sizes.h index 9874f6f67537..1ac79bcee2bb 100644 --- a/include/linux/sizes.h +++ b/include/linux/sizes.h @@ -44,6 +44,9 @@ #define SZ_2G 0x80000000 #define SZ_4G _AC(0x100000000, ULL) +#define SZ_8G _AC(0x200000000, ULL) +#define SZ_16G _AC(0x400000000, ULL) +#define SZ_32G _AC(0x800000000, ULL) #define SZ_64T _AC(0x400000000000, ULL) #endif /* __LINUX_SIZES_H__ */ -- cgit v1.2.3 From aa8c7db494d0a83ecae583aa193f1134ef25d506 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 29 Dec 2020 15:14:55 -0800 Subject: kdev_t: always inline major/minor helper functions Silly GCC doesn't always inline these trivial functions. 
Fixes the following warning: arch/x86/kernel/sys_ia32.o: warning: objtool: cp_stat64()+0xd8: call to new_encode_dev() with UACCESS enabled Link: https://lkml.kernel.org/r/984353b44a4484d86ba9f73884b7306232e25e30.1608737428.git.jpoimboe@redhat.com Signed-off-by: Josh Poimboeuf Reported-by: Randy Dunlap Acked-by: Randy Dunlap [build-tested] Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kdev_t.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h index 85b5151911cf..4856706fbfeb 100644 --- a/include/linux/kdev_t.h +++ b/include/linux/kdev_t.h @@ -21,61 +21,61 @@ }) /* acceptable for old filesystems */ -static inline bool old_valid_dev(dev_t dev) +static __always_inline bool old_valid_dev(dev_t dev) { return MAJOR(dev) < 256 && MINOR(dev) < 256; } -static inline u16 old_encode_dev(dev_t dev) +static __always_inline u16 old_encode_dev(dev_t dev) { return (MAJOR(dev) << 8) | MINOR(dev); } -static inline dev_t old_decode_dev(u16 val) +static __always_inline dev_t old_decode_dev(u16 val) { return MKDEV((val >> 8) & 255, val & 255); } -static inline u32 new_encode_dev(dev_t dev) +static __always_inline u32 new_encode_dev(dev_t dev) { unsigned major = MAJOR(dev); unsigned minor = MINOR(dev); return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12); } -static inline dev_t new_decode_dev(u32 dev) +static __always_inline dev_t new_decode_dev(u32 dev) { unsigned major = (dev & 0xfff00) >> 8; unsigned minor = (dev & 0xff) | ((dev >> 12) & 0xfff00); return MKDEV(major, minor); } -static inline u64 huge_encode_dev(dev_t dev) +static __always_inline u64 huge_encode_dev(dev_t dev) { return new_encode_dev(dev); } -static inline dev_t huge_decode_dev(u64 dev) +static __always_inline dev_t huge_decode_dev(u64 dev) { return new_decode_dev(dev); } -static inline int sysv_valid_dev(dev_t dev) +static __always_inline int sysv_valid_dev(dev_t dev) { return MAJOR(dev) < (1<<14) && MINOR(dev) < (1<<18); } -static inline u32 sysv_encode_dev(dev_t dev) +static __always_inline u32 sysv_encode_dev(dev_t dev) { return MINOR(dev) | (MAJOR(dev) << 18); } -static inline unsigned sysv_major(u32 dev) +static __always_inline unsigned sysv_major(u32 dev) { return (dev >> 18) & 0x3fff; } -static inline unsigned sysv_minor(u32 dev) +static __always_inline unsigned sysv_minor(u32 dev) { return dev & 0x3ffff; } -- cgit v1.2.3 From e89eed02a5f1b864fa5abafc8e8e71bd9fd66d1f Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 5 Jan 2021 20:53:42 +0100 Subject: kcov, usb: hide in_serving_softirq checks in __usb_hcd_giveback_urb Don't open-code in_serving_softirq() checks in __usb_hcd_giveback_urb(); to avoid cluttering the code, hide them in kcov helpers instead.
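The caller side after the change, condensed from the hcd.c hunk below; the softirq test disappears from the completion path because it now lives inside each helper:

        /* inside __usb_hcd_giveback_urb(): */
        kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum);
        urb->complete(urb);
        kcov_remote_stop_softirq();

        /* and in include/linux/kcov.h, each helper wraps the check once: */
        static inline void kcov_remote_start_usb_softirq(u64 id)
        {
                if (in_serving_softirq())
                        kcov_remote_start_usb(id);
        }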
Fixes: aee9ddb1d371 ("kcov, usb: only collect coverage from __usb_hcd_giveback_urb in softirq") Signed-off-by: Andrey Konovalov Link: https://lore.kernel.org/r/aeb430c5bb90b0ccdf1ec302c70831c1a47b9c45.1609876340.git.andreyknvl@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 8 +++----- include/linux/kcov.h | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 60886a7464c3..ad5a0f405a75 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1649,14 +1649,12 @@ static void __usb_hcd_giveback_urb(struct urb *urb) urb->status = status; /* * This function can be called in task context inside another remote - * coverage collection section, but KCOV doesn't support that kind of + * coverage collection section, but kcov doesn't support that kind of * recursion yet. Only collect coverage in softirq context for now. */ - if (in_serving_softirq()) - kcov_remote_start_usb((u64)urb->dev->bus->busnum); + kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum); urb->complete(urb); - if (in_serving_softirq()) - kcov_remote_stop(); + kcov_remote_stop_softirq(); usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); diff --git a/include/linux/kcov.h b/include/linux/kcov.h index a10e84707d82..4e3037dc1204 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -52,6 +52,25 @@ static inline void kcov_remote_start_usb(u64 id) kcov_remote_start(kcov_remote_handle(KCOV_SUBSYSTEM_USB, id)); } +/* + * The softirq flavor of kcov_remote_*() functions is introduced as a temporary + * workaround for kcov's lack of nested remote coverage sections support in + * task context. Adding support for nested sections is tracked in: + * https://bugzilla.kernel.org/show_bug.cgi?id=210337 + */ + +static inline void kcov_remote_start_usb_softirq(u64 id) +{ + if (in_serving_softirq()) + kcov_remote_start_usb(id); +} + +static inline void kcov_remote_stop_softirq(void) +{ + if (in_serving_softirq()) + kcov_remote_stop(); +} + #else static inline void kcov_task_init(struct task_struct *t) {} @@ -66,6 +85,8 @@ static inline u64 kcov_common_handle(void) } static inline void kcov_remote_start_common(u64 id) {} static inline void kcov_remote_start_usb(u64 id) {} +static inline void kcov_remote_start_usb_softirq(u64 id) {} +static inline void kcov_remote_stop_softirq(void) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ -- cgit v1.2.3 From 9ad9f45b3b91162b33abfe175ae75ab65718dbf5 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Thu, 7 Jan 2021 00:03:55 +0800 Subject: iommu/vt-d: Move intel_iommu info from struct intel_svm to struct intel_svm_dev 'struct intel_svm' is shared by all devices bound to a given process, but records only a single pointer to a 'struct intel_iommu'. Consequently, cache invalidations may only be applied to a single DMAR unit, and are erroneously skipped for the other devices. In preparation for fixing this, rework the structures so that the iommu pointer resides in 'struct intel_svm_dev', allowing 'struct intel_svm' to track them in its device list.
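The effect on the flush paths, condensed from the svm.c hunks below: every invalidation now uses the IOMMU recorded in the per-device binding, so walking the device list reaches each DMAR unit rather than the single one that used to be cached in 'struct intel_svm':

        /* e.g. in intel_mm_release(): */
        rcu_read_lock();
        list_for_each_entry_rcu(sdev, &svm->devs, list)
                intel_pasid_tear_down_entry(sdev->iommu, sdev->dev,
                                            svm->pasid, true);
        rcu_read_unlock();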
Fixes: 1c4f88b7f1f9 ("iommu/vt-d: Shared virtual address in scalable mode") Cc: Lu Baolu Cc: Jacob Pan Cc: Raj Ashok Cc: David Woodhouse Reported-by: Guo Kaijie Reported-by: Xin Zeng Signed-off-by: Guo Kaijie Signed-off-by: Xin Zeng Signed-off-by: Liu Yi L Tested-by: Guo Kaijie Cc: stable@vger.kernel.org # v5.0+ Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1609949037-25291-2-git-send-email-yi.l.liu@intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/svm.c | 9 +++++---- include/linux/intel-iommu.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 9bcedd360235..790ef3497e7e 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -142,7 +142,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(svm->iommu, &desc, 1, 0); + qi_submit_sync(sdev->iommu, &desc, 1, 0); if (sdev->dev_iotlb) { desc.qw0 = QI_DEV_EIOTLB_PASID(svm->pasid) | @@ -166,7 +166,7 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d } desc.qw2 = 0; desc.qw3 = 0; - qi_submit_sync(svm->iommu, &desc, 1, 0); + qi_submit_sync(sdev->iommu, &desc, 1, 0); } } @@ -211,7 +211,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) */ rcu_read_lock(); list_for_each_entry_rcu(sdev, &svm->devs, list) - intel_pasid_tear_down_entry(svm->iommu, sdev->dev, + intel_pasid_tear_down_entry(sdev->iommu, sdev->dev, svm->pasid, true); rcu_read_unlock(); @@ -364,6 +364,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, } sdev->dev = dev; sdev->sid = PCI_DEVID(info->bus, info->devfn); + sdev->iommu = iommu; /* Only count users if device has aux domains */ if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)) @@ -548,6 +549,7 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, goto out; } sdev->dev = dev; + sdev->iommu = iommu; ret = intel_iommu_enable_pasid(iommu, dev); if (ret) { @@ -577,7 +579,6 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, kfree(sdev); goto out; } - svm->iommu = iommu; if (pasid_max > intel_pasid_max_id) pasid_max = intel_pasid_max_id; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index d956987ed032..94522685a0d9 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -758,6 +758,7 @@ struct intel_svm_dev { struct list_head list; struct rcu_head rcu; struct device *dev; + struct intel_iommu *iommu; struct svm_dev_ops *ops; struct iommu_sva sva; u32 pasid; @@ -771,7 +772,6 @@ struct intel_svm { struct mmu_notifier notifier; struct mm_struct *mm; - struct intel_iommu *iommu; unsigned int flags; u32 pasid; int gpasid; /* In case that guest PASID is different from host PASID */ -- cgit v1.2.3 From 18abda7a2d555783d28ea1701f3ec95e96237a86 Mon Sep 17 00:00:00 2001 From: Liu Yi L Date: Thu, 7 Jan 2021 00:03:56 +0800 Subject: iommu/vt-d: Fix general protection fault in aux_detach_device() The aux-domain attach/detach operations are not tracked, so some data structures might be used after free. This causes general protection faults when multiple subdevices are created and assigned to the same guest machine: | general protection fault, probably for non-canonical address 0xdead000000000100: 0000 [#1] SMP NOPTI | RIP: 0010:intel_iommu_aux_detach_device+0x12a/0x1f0 | [...] | Call Trace: | iommu_aux_detach_device+0x24/0x70 | vfio_mdev_detach_domain+0x3b/0x60 | ?
vfio_mdev_set_domain+0x50/0x50 | iommu_group_for_each_dev+0x4f/0x80 | vfio_iommu_detach_group.isra.0+0x22/0x30 | vfio_iommu_type1_detach_group.cold+0x71/0x211 | ? find_exported_symbol_in_section+0x4a/0xd0 | ? each_symbol_section+0x28/0x50 | __vfio_group_unset_container+0x4d/0x150 | vfio_group_try_dissolve_container+0x25/0x30 | vfio_group_put_external_user+0x13/0x20 | kvm_vfio_group_put_external_user+0x27/0x40 [kvm] | kvm_vfio_destroy+0x45/0xb0 [kvm] | kvm_put_kvm+0x1bb/0x2e0 [kvm] | kvm_vm_release+0x22/0x30 [kvm] | __fput+0xcc/0x260 | ____fput+0xe/0x10 | task_work_run+0x8f/0xb0 | do_exit+0x358/0xaf0 | ? wake_up_state+0x10/0x20 | ? signal_wake_up_state+0x1a/0x30 | do_group_exit+0x47/0xb0 | __x64_sys_exit_group+0x18/0x20 | do_syscall_64+0x57/0x1d0 | entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fix the crash by tracking the subdevices when attaching and detaching aux-domains. Fixes: 67b8e02b5e76 ("iommu/vt-d: Aux-domain specific domain attach/detach") Co-developed-by: Xin Zeng Signed-off-by: Xin Zeng Signed-off-by: Liu Yi L Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1609949037-25291-3-git-send-email-yi.l.liu@intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 95 +++++++++++++++++++++++++++++++++------------ include/linux/intel-iommu.h | 16 +++++--- 2 files changed, 82 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 788119c5b021..d7720a836268 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1877,6 +1877,7 @@ static struct dmar_domain *alloc_domain(int flags) domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL; domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->subdevices); return domain; } @@ -2547,7 +2548,7 @@ static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu, info->iommu = iommu; info->pasid_table = NULL; info->auxd_enabled = 0; - INIT_LIST_HEAD(&info->auxiliary_domains); + INIT_LIST_HEAD(&info->subdevices); if (dev && dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(info->dev); @@ -4475,33 +4476,61 @@ is_aux_domain(struct device *dev, struct iommu_domain *domain) domain->type == IOMMU_DOMAIN_UNMANAGED; } -static void auxiliary_link_device(struct dmar_domain *domain, - struct device *dev) +static inline struct subdev_domain_info * +lookup_subdev_info(struct dmar_domain *domain, struct device *dev) +{ + struct subdev_domain_info *sinfo; + + if (!list_empty(&domain->subdevices)) { + list_for_each_entry(sinfo, &domain->subdevices, link_domain) { + if (sinfo->pdev == dev) + return sinfo; + } + } + + return NULL; +} + +static int auxiliary_link_device(struct dmar_domain *domain, + struct device *dev) { struct device_domain_info *info = get_domain_info(dev); + struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev); assert_spin_locked(&device_domain_lock); if (WARN_ON(!info)) - return; + return -EINVAL; + + if (!sinfo) { + sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC); + sinfo->domain = domain; + sinfo->pdev = dev; + list_add(&sinfo->link_phys, &info->subdevices); + list_add(&sinfo->link_domain, &domain->subdevices); + } - domain->auxd_refcnt++; - list_add(&domain->auxd, &info->auxiliary_domains); + return ++sinfo->users; } -static void auxiliary_unlink_device(struct dmar_domain *domain, - struct device *dev) +static int auxiliary_unlink_device(struct dmar_domain *domain, + struct device *dev) { struct device_domain_info *info = get_domain_info(dev); + struct subdev_domain_info 
*sinfo = lookup_subdev_info(domain, dev); + int ret; assert_spin_locked(&device_domain_lock); - if (WARN_ON(!info)) - return; + if (WARN_ON(!info || !sinfo || sinfo->users <= 0)) + return -EINVAL; - list_del(&domain->auxd); - domain->auxd_refcnt--; + ret = --sinfo->users; + if (!ret) { + list_del(&sinfo->link_phys); + list_del(&sinfo->link_domain); + kfree(sinfo); + } - if (!domain->auxd_refcnt && domain->default_pasid > 0) - ioasid_put(domain->default_pasid); + return ret; } static int aux_domain_add_dev(struct dmar_domain *domain, @@ -4530,6 +4559,19 @@ static int aux_domain_add_dev(struct dmar_domain *domain, } spin_lock_irqsave(&device_domain_lock, flags); + ret = auxiliary_link_device(domain, dev); + if (ret <= 0) + goto link_failed; + + /* + * Subdevices from the same physical device can be attached to the + * same domain. For such cases, only the first subdevice attachment + * needs to go through the full steps in this function. So if ret > + * 1, just goto out. + */ + if (ret > 1) + goto out; + /* * iommu->lock must be held to attach domain to iommu and setup the * pasid entry for second level translation. @@ -4548,10 +4590,9 @@ static int aux_domain_add_dev(struct dmar_domain *domain, domain->default_pasid); if (ret) goto table_failed; - spin_unlock(&iommu->lock); - - auxiliary_link_device(domain, dev); + spin_unlock(&iommu->lock); +out: spin_unlock_irqrestore(&device_domain_lock, flags); return 0; @@ -4560,8 +4601,10 @@ table_failed: domain_detach_iommu(domain, iommu); attach_failed: spin_unlock(&iommu->lock); + auxiliary_unlink_device(domain, dev); +link_failed: spin_unlock_irqrestore(&device_domain_lock, flags); - if (!domain->auxd_refcnt && domain->default_pasid > 0) + if (list_empty(&domain->subdevices) && domain->default_pasid > 0) ioasid_put(domain->default_pasid); return ret; @@ -4581,14 +4624,18 @@ static void aux_domain_remove_dev(struct dmar_domain *domain, info = get_domain_info(dev); iommu = info->iommu; - auxiliary_unlink_device(domain, dev); - - spin_lock(&iommu->lock); - intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false); - domain_detach_iommu(domain, iommu); - spin_unlock(&iommu->lock); + if (!auxiliary_unlink_device(domain, dev)) { + spin_lock(&iommu->lock); + intel_pasid_tear_down_entry(iommu, dev, + domain->default_pasid, false); + domain_detach_iommu(domain, iommu); + spin_unlock(&iommu->lock); + } spin_unlock_irqrestore(&device_domain_lock, flags); + + if (list_empty(&domain->subdevices) && domain->default_pasid > 0) + ioasid_put(domain->default_pasid); } static int prepare_domain_attach_device(struct iommu_domain *domain, diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 94522685a0d9..09c6a0bf3892 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -533,11 +533,10 @@ struct dmar_domain { /* Domain ids per IOMMU. 
Use u16 since * domain ids are 16 bit wide according * to VT-d spec, section 9.3 */ - unsigned int auxd_refcnt; /* Refcount of auxiliary attaching */ bool has_iotlb_device; struct list_head devices; /* all devices' list */ - struct list_head auxd; /* link to device's auxiliary list */ + struct list_head subdevices; /* all subdevices' list */ struct iova_domain iovad; /* iova's that belong to this domain */ struct dma_pte *pgd; /* virtual address */ @@ -610,14 +609,21 @@ struct intel_iommu { struct dmar_drhd_unit *drhd; }; +/* Per subdevice private data */ +struct subdev_domain_info { + struct list_head link_phys; /* link to phys device siblings */ + struct list_head link_domain; /* link to domain siblings */ + struct device *pdev; /* physical device derived from */ + struct dmar_domain *domain; /* aux-domain */ + int users; /* user count */ +}; + /* PCI domain-device relationship */ struct device_domain_info { struct list_head link; /* link to domain siblings */ struct list_head global; /* link to global list */ struct list_head table; /* link to pasid table */ - struct list_head auxiliary_domains; /* auxiliary domains - * attached to this device - */ + struct list_head subdevices; /* subdevices sibling */ u32 segment; /* PCI segment number */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ -- cgit v1.2.3 From ee61cfd955a64a58ed35cbcfc54068fcbd486945 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Thu, 31 Dec 2020 19:35:25 +0800 Subject: ACPI: scan: add stub acpi_create_platform_device() for !CONFIG_ACPI Add a stub acpi_create_platform_device() for !CONFIG_ACPI builds, so that callers don't have to deal with !CONFIG_ACPI build issues. Reported-by: kernel test robot Signed-off-by: Shawn Guo Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 2630c2e953f7..053bf05fb1f7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -885,6 +885,13 @@ static inline int acpi_device_modalias(struct device *dev, return -ENODEV; } +static inline struct platform_device * +acpi_create_platform_device(struct acpi_device *adev, + struct property_entry *properties) +{ + return NULL; +} + static inline bool acpi_dma_supported(struct acpi_device *adev) { return false; -- cgit v1.2.3 From 9c9be85f6b59d80efe4705109c0396df18d4e11d Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 24 Nov 2020 22:16:23 +0200 Subject: net/mlx5e: Add missing capability check for uplink follow Expose the firmware indication that it supports setting the eswitch uplink state to follow (follow the physical link). Condition setting the eswitch uplink admin state on this capability bit. Older FW may not support the uplink state setting.
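For illustration, the admin-state path is now gated on the new HCA capability bit; a condensed sketch of the resulting logic (the full change is in the diff below):

	mlx5_set_port_admin_status(mdev, state);

	/* Leave the eswitch uplink alone unless FW can follow the physical link. */
	if (mlx5_eswitch_mode(mdev) == MLX5_ESWITCH_OFFLOADS ||
	    !MLX5_CAP_GEN(mdev, uplink_follow))
		return;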
Fixes: 7d0314b11cdd ("net/mlx5e: Modify uplink state on interface up/down") Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 ++- include/linux/mlx5/mlx5_ifc.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 7a79d330c075..6a852b4901aa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3161,7 +3161,8 @@ static void mlx5e_modify_admin_state(struct mlx5_core_dev *mdev, mlx5_set_port_admin_status(mdev, state); - if (mlx5_eswitch_mode(mdev) != MLX5_ESWITCH_LEGACY) + if (mlx5_eswitch_mode(mdev) == MLX5_ESWITCH_OFFLOADS || + !MLX5_CAP_GEN(mdev, uplink_follow)) return; if (state == MLX5_PORT_UP) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8fbddec26eb8..442c0160caab 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1280,7 +1280,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 ece_support[0x1]; u8 reserved_at_a4[0x7]; u8 log_max_srq[0x5]; - u8 reserved_at_b0[0x2]; + u8 reserved_at_b0[0x1]; + u8 uplink_follow[0x1]; u8 ts_cqe_to_dest_cqn[0x1]; u8 reserved_at_b3[0xd]; -- cgit v1.2.3 From a91bd6223ecd46addc71ee6fcd432206d39365d2 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 8 Jan 2021 12:48:47 +0100 Subject: Revert "init/console: Use ttynull as a fallback when there is no console" This reverts commit 757055ae8dedf5333af17b3b5b4b70ba9bc9da4e. The commit caused ttynull to be used as the default console on several systems [1][2][3]. As a result, the console was blank even when a better alternative existed. It happened when there was no console configured on the command line and ttynull_init() was the first initcall calling register_console(). Or it happened when /dev/ did not exist when console_on_rootfs() was called. It was not able to open /dev/console even though a console driver was registered. It tried to add the ttynull console, but that obviously did not help. Still, ttynull became the preferred console and was used by /dev/console when it was available later. The commit tried to fix a historical problem that has been there for ages. The primary motivation was commit 3cffa06aeef7ece30f6 ("printk/console: Allow to disable console output by using console="" or console=null"). It provided a clean solution for a workaround that was widely used and worked only by chance. With this revert, the console="" and console=null command line options will again work only by chance. These options cause a particular console to be preferred, so the default (tty) ones do not get enabled. There will be no console registered at all. As a result there won't be stdin, stdout, and stderr for the init process. But it worked exactly this way even before. The proper solution has to fulfill many conditions: + Register ttynull only when explicitly required or as the ultimate fallback. + ttynull should get associated with /dev/console, but it must not become the preferred console when used as a fallback. Especially, it must still be possible to replace it with a better console later. Such a change requires a cleanup of the register_console() code. Otherwise, it would be even harder to follow. Especially, the use of has_preferred_console and the CON_CONSDEV flag is tricky. The cleanup is risky. The ordering of consoles is not well defined.
And any changes tend to break existing user settings. Do the revert as the least risky solution for now. [1] https://lore.kernel.org/linux-kselftest/20201221144302.GR4077@smile.fi.intel.com/ [2] https://lore.kernel.org/lkml/d2a3b3c0-e548-7dd1-730f-59bc5c04e191@synopsys.com/ [3] https://patchwork.ozlabs.org/project/linux-um/patch/20210105120128.10854-1-thomas@m3y3r.de/ Reported-by: Andy Shevchenko Reported-by: Vineet Gupta Reported-by: Thomas Meyer Signed-off-by: Petr Mladek Acked-by: Greg Kroah-Hartman Acked-by: Sergey Senozhatsky Signed-off-by: Linus Torvalds --- drivers/tty/Kconfig | 14 ++++++++++++++ drivers/tty/Makefile | 3 ++- drivers/tty/ttynull.c | 18 ------------------ include/linux/console.h | 3 --- init/main.c | 10 ++-------- 5 files changed, 18 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig index 47a6e42f0d04..e15cd6b5bb99 100644 --- a/drivers/tty/Kconfig +++ b/drivers/tty/Kconfig @@ -401,6 +401,20 @@ config MIPS_EJTAG_FDC_KGDB_CHAN help FDC channel number to use for KGDB. +config NULL_TTY + tristate "NULL TTY driver" + help + Say Y here if you want a NULL TTY which simply discards messages. + + This is useful to allow userspace applications which expect a console + device to work without modifications even when no console is + available or desired. + + In order to use this driver, you should redirect the console to this + TTY, or boot the kernel with console=ttynull. + + If unsure, say N. + config TRACE_ROUTER tristate "Trace data router for MIPI P1149.7 cJTAG standard" depends on TRACE_SINK diff --git a/drivers/tty/Makefile b/drivers/tty/Makefile index 3c1c5a9240a7..b3ccae932660 100644 --- a/drivers/tty/Makefile +++ b/drivers/tty/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_TTY) += tty_io.o n_tty.o tty_ioctl.o tty_ldisc.o \ tty_buffer.o tty_port.o tty_mutex.o \ tty_ldsem.o tty_baudrate.o tty_jobctrl.o \ - n_null.o ttynull.o + n_null.o obj-$(CONFIG_LEGACY_PTYS) += pty.o obj-$(CONFIG_UNIX98_PTYS) += pty.o obj-$(CONFIG_AUDIT) += tty_audit.o @@ -25,6 +25,7 @@ obj-$(CONFIG_ISI) += isicom.o obj-$(CONFIG_MOXA_INTELLIO) += moxa.o obj-$(CONFIG_MOXA_SMARTIO) += mxser.o obj-$(CONFIG_NOZOMI) += nozomi.o +obj-$(CONFIG_NULL_TTY) += ttynull.o obj-$(CONFIG_ROCKETPORT) += rocket.o obj-$(CONFIG_SYNCLINK_GT) += synclink_gt.o obj-$(CONFIG_PPC_EPAPR_HV_BYTECHAN) += ehv_bytechan.o diff --git a/drivers/tty/ttynull.c b/drivers/tty/ttynull.c index eced70ec54e1..17f05b7eb6d3 100644 --- a/drivers/tty/ttynull.c +++ b/drivers/tty/ttynull.c @@ -2,13 +2,6 @@ /* * Copyright (C) 2019 Axis Communications AB * - * The console is useful for userspace applications which expect a console - * device to work without modifications even when no console is available - * or desired. - * - * In order to use this driver, you should redirect the console to this - * TTY, or boot the kernel with console=ttynull. 
- * Based on ttyprintk.c: * Copyright (C) 2010 Samo Pogacnik */ @@ -66,17 +59,6 @@ static struct console ttynull_console = { .device = ttynull_device, }; -void __init register_ttynull_console(void) -{ - if (!ttynull_driver) - return; - - if (add_preferred_console(ttynull_console.name, 0, NULL)) - return; - - register_console(&ttynull_console); -} - static int __init ttynull_init(void) { struct tty_driver *driver; diff --git a/include/linux/console.h b/include/linux/console.h index dbe78e8e2602..20874db50bc8 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -186,12 +186,9 @@ extern int braille_register_console(struct console *, int index, extern int braille_unregister_console(struct console *); #ifdef CONFIG_TTY extern void console_sysfs_notify(void); -extern void register_ttynull_console(void); #else static inline void console_sysfs_notify(void) { } -static inline void register_ttynull_console(void) -{ } #endif extern bool console_suspend_enabled; diff --git a/init/main.c b/init/main.c index 421640fca375..c68d784376ca 100644 --- a/init/main.c +++ b/init/main.c @@ -1480,14 +1480,8 @@ void __init console_on_rootfs(void) struct file *file = filp_open("/dev/console", O_RDWR, 0); if (IS_ERR(file)) { - pr_err("Warning: unable to open an initial console. Fallback to ttynull.\n"); - register_ttynull_console(); - - file = filp_open("/dev/console", O_RDWR, 0); - if (IS_ERR(file)) { - pr_err("Warning: Failed to add ttynull console. No stdin, stdout, and stderr for the init process!\n"); - return; - } + pr_err("Warning: unable to open an initial console.\n"); + return; } init_dup(file); init_dup(file); -- cgit v1.2.3 From 9b5948267adc9e689da609eb61cf7ed49cae5fa8 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 8 Jan 2021 11:15:56 -0500 Subject: dm integrity: fix flush with external metadata device With an external metadata device, flush requests are not passed down to the data device. Fix this by submitting the flush request in dm_integrity_flush_buffers(). In order to not degrade performance, we overlap the data device flush with the metadata device flush. 
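The overlap works by kicking off the data-device flush asynchronously through dm-io, writing the dirty metadata buffers while that flush is in flight, and only then waiting for its completion (a condensed sketch of the scheme in the patch below):

	init_completion(&fr.comp);
	r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL);	/* async REQ_PREFLUSH to the data device */
	BUG_ON(r);

	r = dm_bufio_write_dirty_buffers(ic->bufio);	/* metadata flush runs concurrently */
	if (unlikely(r))
		dm_integrity_io_error(ic, "writing tags", r);

	wait_for_completion(&fr.comp);			/* join the data-device flush */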
Reported-by: Lukas Straub Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 6 +++++ drivers/md/dm-integrity.c | 60 ++++++++++++++++++++++++++++++++++++++--------- include/linux/dm-bufio.h | 1 + 3 files changed, 56 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 9c1a86bde658..fce4cbf9529d 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1534,6 +1534,12 @@ sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) } EXPORT_SYMBOL_GPL(dm_bufio_get_device_size); +struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c) +{ + return c->dm_io; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client); + sector_t dm_bufio_get_block_number(struct dm_buffer *b) { return b->block; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 5a7a1b90e671..11c7c538f7a9 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1379,12 +1379,52 @@ thorough_test: #undef MAY_BE_HASH } -static void dm_integrity_flush_buffers(struct dm_integrity_c *ic) +struct flush_request { + struct dm_io_request io_req; + struct dm_io_region io_reg; + struct dm_integrity_c *ic; + struct completion comp; +}; + +static void flush_notify(unsigned long error, void *fr_) +{ + struct flush_request *fr = fr_; + if (unlikely(error != 0)) + dm_integrity_io_error(fr->ic, "flusing disk cache", -EIO); + complete(&fr->comp); +} + +static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_data) { int r; + + struct flush_request fr; + + if (!ic->meta_dev) + flush_data = false; + if (flush_data) { + fr.io_req.bi_op = REQ_OP_WRITE, + fr.io_req.bi_op_flags = REQ_PREFLUSH | REQ_SYNC, + fr.io_req.mem.type = DM_IO_KMEM, + fr.io_req.mem.ptr.addr = NULL, + fr.io_req.notify.fn = flush_notify, + fr.io_req.notify.context = &fr; + fr.io_req.client = dm_bufio_get_dm_io_client(ic->bufio), + fr.io_reg.bdev = ic->dev->bdev, + fr.io_reg.sector = 0, + fr.io_reg.count = 0, + fr.ic = ic; + init_completion(&fr.comp); + r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL); + BUG_ON(r); + } + r = dm_bufio_write_dirty_buffers(ic->bufio); if (unlikely(r)) dm_integrity_io_error(ic, "writing tags", r); + + if (flush_data) + wait_for_completion(&fr.comp); } static void sleep_on_endio_wait(struct dm_integrity_c *ic) @@ -2110,7 +2150,7 @@ offload_to_thread: if (unlikely(dio->op == REQ_OP_DISCARD) && likely(ic->mode != 'D')) { integrity_metadata(&dio->work); - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, false); dio->in_flight = (atomic_t)ATOMIC_INIT(1); dio->completion = NULL; @@ -2195,7 +2235,7 @@ static void integrity_commit(struct work_struct *w) flushes = bio_list_get(&ic->flush_bio_list); if (unlikely(ic->mode != 'J')) { spin_unlock_irq(&ic->endio_wait.lock); - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, true); goto release_flush_bios; } @@ -2409,7 +2449,7 @@ skip_io: complete_journal_op(&comp); wait_for_completion_io(&comp.comp); - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, true); } static void integrity_writer(struct work_struct *w) @@ -2451,7 +2491,7 @@ static void recalc_write_super(struct dm_integrity_c *ic) { int r; - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, false); if (dm_integrity_failed(ic)) return; @@ -2654,7 +2694,7 @@ static void bitmap_flush_work(struct work_struct *work) unsigned long limit; struct bio *bio; - dm_integrity_flush_buffers(ic); + 
dm_integrity_flush_buffers(ic, false); range.logical_sector = 0; range.n_sectors = ic->provided_data_sectors; @@ -2663,9 +2703,7 @@ static void bitmap_flush_work(struct work_struct *work) add_new_range_and_wait(ic, &range); spin_unlock_irq(&ic->endio_wait.lock); - dm_integrity_flush_buffers(ic); - if (ic->meta_dev) - blkdev_issue_flush(ic->dev->bdev, GFP_NOIO); + dm_integrity_flush_buffers(ic, true); limit = ic->provided_data_sectors; if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { @@ -2934,11 +2972,11 @@ static void dm_integrity_postsuspend(struct dm_target *ti) if (ic->meta_dev) queue_work(ic->writer_wq, &ic->writer_work); drain_workqueue(ic->writer_wq); - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, true); } if (ic->mode == 'B') { - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, true); #if 1 /* set to 0 to test bitmap replay code */ init_journal(ic, 0, ic->journal_sections, 0); diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 29d255fdd5d6..90bd558a17f5 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -150,6 +150,7 @@ void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n); unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); +struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c); sector_t dm_bufio_get_block_number(struct dm_buffer *b); void *dm_bufio_get_block_data(struct dm_buffer *b); void *dm_bufio_get_aux_data(struct dm_buffer *b); -- cgit v1.2.3 From 29766bcffad03da66892bef82674883e31f78fec Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 9 Jan 2021 17:18:32 -0500 Subject: net: support kmap_local forced debugging in skb_frag_foreach Skb frags may be backed by highmem and/or compound pages. Highmem pages need kmap_atomic mappings to access. But kmap_atomic maps a single page, not the entire compound page. skb_foreach_page iterates over an skb frag, in one step in the common case, page by page only if kmap_atomic must be called for each page. The decision logic is captured in skb_frag_must_loop. CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP extends kmap from highmem to all pages, to increase code coverage. Extend skb_frag_must_loop to this new condition. Link: https://lore.kernel.org/linux-mm/20210106180132.41dc249d@gandalf.local.home/ Fixes: 0e91a0c6984c ("mm/highmem: Provide CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP") Reported-by: Steven Rostedt (VMware) Signed-off-by: Linus Torvalds Signed-off-by: Willem de Bruijn Tested-by: Steven Rostedt (VMware) Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 333bcdc39635..c858adfb5a82 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -366,7 +366,7 @@ static inline void skb_frag_size_sub(skb_frag_t *frag, int delta) static inline bool skb_frag_must_loop(struct page *p) { #if defined(CONFIG_HIGHMEM) - if (PageHighMem(p)) + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p)) return true; #endif return false; -- cgit v1.2.3 From 97550f6fa59254435d864b92603de3ca4b5a99f8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Sat, 9 Jan 2021 17:18:33 -0500 Subject: net: compound page support in skb_seq_read skb_seq_read iterates over an skb, returning pointer and length of the next data range with each call. It relies on kmap_atomic to access highmem pages when needed. 
An skb frag may be backed by a compound page, but kmap_atomic maps only a single page. There are not enough kmap slots to always map all pages concurrently. Instead, if kmap_atomic is needed, iterate over each page. As this increases the number of calls, avoid this unless needed. The necessary condition is captured in skb_frag_must_loop. I tried to make the change as obvious as possible. It should be easy to verify that nothing changes if skb_frag_must_loop returns false. Tested: On an x86 platform with CONFIG_HIGHMEM=y CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP=y CONFIG_NETFILTER_XT_MATCH_STRING=y Run ip link set dev lo mtu 1500 iptables -A OUTPUT -m string --string 'badstring' -algo bm -j ACCEPT dd if=/dev/urandom of=in bs=1M count=20 nc -l -p 8000 > /dev/null & nc -w 1 -q 0 localhost 8000 < in Signed-off-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 28 +++++++++++++++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c858adfb5a82..5f60c9e907c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1203,6 +1203,7 @@ struct skb_seq_state { struct sk_buff *root_skb; struct sk_buff *cur_skb; __u8 *frag_data; + __u32 frag_off; }; void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b6f2b520a9b7..0da035c1e53f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3442,6 +3442,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, st->root_skb = st->cur_skb = skb; st->frag_idx = st->stepped_offset = 0; st->frag_data = NULL; + st->frag_off = 0; } EXPORT_SYMBOL(skb_prepare_seq_read); @@ -3496,14 +3497,27 @@ next_skb: st->stepped_offset += skb_headlen(st->cur_skb); while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + unsigned int pg_idx, pg_off, pg_sz; + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; - block_limit = skb_frag_size(frag) + st->stepped_offset; + pg_idx = 0; + pg_off = skb_frag_off(frag); + pg_sz = skb_frag_size(frag); + + if (skb_frag_must_loop(skb_frag_page(frag))) { + pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT; + pg_off = offset_in_page(pg_off + st->frag_off); + pg_sz = min_t(unsigned int, pg_sz - st->frag_off, + PAGE_SIZE - pg_off); + } + + block_limit = pg_sz + st->stepped_offset; if (abs_offset < block_limit) { if (!st->frag_data) - st->frag_data = kmap_atomic(skb_frag_page(frag)); + st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx); - *data = (u8 *) st->frag_data + skb_frag_off(frag) + + *data = (u8 *)st->frag_data + pg_off + (abs_offset - st->stepped_offset); return block_limit - abs_offset; @@ -3514,8 +3528,12 @@ next_skb: st->frag_data = NULL; } - st->frag_idx++; - st->stepped_offset += skb_frag_size(frag); + st->stepped_offset += pg_sz; + st->frag_off += pg_sz; + if (st->frag_off == skb_frag_size(frag)) { + st->frag_off = 0; + st->frag_idx++; + } } if (st->frag_data) { -- cgit v1.2.3 From aba428a0c612bb259891307da12e22efd0fab14c Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Tue, 1 Dec 2020 17:52:31 +0800 Subject: timekeeping: Remove unused get_seconds() The get_seconds() cleanup seems to have been completed, now it is time to delete the legacy interface to avoid misuse later. 
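For any remaining callers, the migration is a direct substitution, since the stub being removed was already just a truncating wrapper; ktime_get_real_seconds() returns the same wall-clock seconds in a y2038-safe 64-bit type (a minimal sketch):

	/* before (y2038-unsafe on 32-bit): */
	unsigned long then = get_seconds();

	/* after: */
	time64_t now = ktime_get_real_seconds();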
Signed-off-by: Chunguang Xu Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1606816351-26900-1-git-send-email-brookxu@tencent.com --- include/linux/ktime.h | 1 - include/linux/timekeeping32.h | 14 -------------- kernel/time/timekeeping.c | 3 +-- 3 files changed, 1 insertion(+), 17 deletions(-) delete mode 100644 include/linux/timekeeping32.h (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index a12b5523cc18..73f20deb497d 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -230,6 +230,5 @@ static inline ktime_t ms_to_ktime(u64 ms) } # include -# include #endif diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h deleted file mode 100644 index 266017fc9ee9..000000000000 --- a/include/linux/timekeeping32.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _LINUX_TIMEKEEPING32_H -#define _LINUX_TIMEKEEPING32_H -/* - * These interfaces are all based on the old timespec type - * and should get replaced with the timespec64 based versions - * over time so we can remove the file here. - */ - -static inline unsigned long get_seconds(void) -{ - return ktime_get_real_seconds(); -} - -#endif diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a45cedda93a7..6aee5768c86f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -991,8 +991,7 @@ EXPORT_SYMBOL_GPL(ktime_get_seconds); /** * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME * - * Returns the wall clock seconds since 1970. This replaces the - * get_seconds() interface which is not y2038 safe on 32bit systems. + * Returns the wall clock seconds since 1970. * * For 64bit systems the fast access to tk->xtime_sec is preserved. On * 32bit systems the access must be protected with the sequence -- cgit v1.2.3 From 7ea510b92c7c9b4eb5ff72e6b4bbad4b0407a914 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 12 Jan 2021 15:49:11 -0800 Subject: mm/memcontrol: fix warning in mem_cgroup_page_lruvec() Boot a CONFIG_MEMCG=y kernel with "cgroup_disabled=memory" and you are met by a series of warnings from the VM_WARN_ON_ONCE_PAGE(!memcg, page) recently added to the inline mem_cgroup_page_lruvec(). An earlier attempt to place that warning, in mem_cgroup_lruvec(), had been careful to do so after weeding out the mem_cgroup_disabled() case; but was itself invalid because of the mem_cgroup_lruvec(NULL, pgdat) in clear_pgdat_congested() and age_active_anon(). Warning in mem_cgroup_page_lruvec() was once useful in detecting a KSM charge bug, so may be worth keeping: but skip if mem_cgroup_disabled(). 
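For illustration, with "cgroup_disabled=memory" page_memcg() returns NULL for every page, so the check has to tolerate that case (a sketch of the guarded helper matching the one-liner below; with memcg disabled, mem_cgroup_lruvec() falls back to the node's lruvec):

	struct mem_cgroup *memcg = page_memcg(page);	/* NULL when memcg is disabled */

	VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page);
	return mem_cgroup_lruvec(memcg, pgdat);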
Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2101032056260.1093@eggly.anvils Fixes: 9a1ac2288cf1 ("mm/memcontrol:rewrite mem_cgroup_page_lruvec()") Signed-off-by: Hugh Dickins Reviewed-by: Alex Shi Acked-by: Roman Gushchin Acked-by: Chris Down Reviewed-by: Baoquan He Acked-by: Vlastimil Babka Cc: Hui Su Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d827bd7f3bfe..eeb0b52203e9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -665,7 +665,7 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, { struct mem_cgroup *memcg = page_memcg(page); - VM_WARN_ON_ONCE_PAGE(!memcg, page); + VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page); return mem_cgroup_lruvec(memcg, pgdat); } -- cgit v1.2.3 From 29970dc24faf0078beb4efab5455b4f504d2198d Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Tue, 12 Jan 2021 15:49:14 -0800 Subject: arm/kasan: fix the array size of kasan_early_shadow_pte[] The size of kasan_early_shadow_pte[] is now PTRS_PER_PTE, which is defined as 512 for arm. This means that it only covers the preceding Linux pte entries, but not the HWTABLE pte entries on arm. The reason it currently works is that the symbol kasan_early_shadow_page immediately following kasan_early_shadow_pte in memory is page aligned, which makes kasan_early_shadow_pte look like a 4KB array. But we can't ensure the order is always right with a different compiler/linker, or if more bss symbols are introduced. We ran a test with QEMU + vexpress: put a 512KB symbol with the attribute __section(".bss..page_aligned") after kasan_early_shadow_pte and poisoned it after kasan_early_init(). Then, with CONFIG_KASAN enabled, the kernel failed to boot. 
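For context, arm's 2-level scheme keeps the Linux pte entries and the hardware pte entries that the MMU actually walks back to back in one page, which is why the shadow table must be sized for both halves (a simplified sketch; the assumption that PTE_HWTABLE_PTRS equals PTRS_PER_PTE, i.e. 512, holds for arm, and the fallback of 0 keeps other architectures unchanged):

	#ifndef PTE_HWTABLE_PTRS
	#define PTE_HWTABLE_PTRS 0	/* no extra HW entries on other architectures */
	#endif

	/* covers the Linux half and the HW half of the page: */
	pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]
		__page_aligned_bss;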
Link: https://lkml.kernel.org/r/20210109044622.8312-1-hailongliiu@yeah.net Signed-off-by: Hailong Liu Signed-off-by: Ziliang Guo Reviewed-by: Linus Walleij Cc: Andrey Ryabinin Cc: Russell King Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Ard Biesheuvel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 +++++- mm/kasan/init.c | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5e0655fb2a6f..fe1ae73ff8b5 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -35,8 +35,12 @@ struct kunit_kasan_expectation { #define KASAN_SHADOW_INIT 0 #endif +#ifndef PTE_HWTABLE_PTRS +#define PTE_HWTABLE_PTRS 0 +#endif + extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; -extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE]; +extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]; extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD]; extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD]; extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; diff --git a/mm/kasan/init.c b/mm/kasan/init.c index bc0ad208b3a7..7ca0b92d5886 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -64,7 +64,8 @@ static inline bool kasan_pmd_table(pud_t pud) return false; } #endif -pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; +pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS] + __page_aligned_bss; static inline bool kasan_pte_table(pmd_t pmd) { -- cgit v1.2.3 From b90d72a6bfdb5e5c62cd223a8cdf4045bfbcb94d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 12 Jan 2021 22:18:55 +0000 Subject: Revert "arm64: Enable perf events based hard lockup detector" This reverts commit 367c820ef08082e68df8a3bc12e62393af21e4b5. lockup_detector_init() makes heavy use of per-cpu variables and must be called with preemption disabled. Usually, it's handled early during boot in kernel_init_freeable(), before SMP has been initialised. Since we do not know whether or not our PMU interrupt can be signalled as an NMI until considerably later in the boot process, the Arm PMU driver attempts to re-initialise the lockup detector off the back of a device_initcall(). Unfortunately, this is called from preemptible context and results in the following splat: | BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1 | caller is debug_smp_processor_id+0x20/0x2c | CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.10.0+ #276 | Hardware name: linux,dummy-virt (DT) | Call trace: | dump_backtrace+0x0/0x3c0 | show_stack+0x20/0x6c | dump_stack+0x2f0/0x42c | check_preemption_disabled+0x1cc/0x1dc | debug_smp_processor_id+0x20/0x2c | hardlockup_detector_event_create+0x34/0x18c | hardlockup_detector_perf_init+0x2c/0x134 | watchdog_nmi_probe+0x18/0x24 | lockup_detector_init+0x44/0xa8 | armv8_pmu_driver_init+0x54/0x78 | do_one_initcall+0x184/0x43c | kernel_init_freeable+0x368/0x380 | kernel_init+0x1c/0x1cc | ret_from_fork+0x10/0x30 Rather than bodge this with raw_smp_processor_id() or randomly disabling preemption, simply revert the culprit for now until we figure out how to do this properly. 
Reported-by: Lecopzer Chen Signed-off-by: Will Deacon Acked-by: Mark Rutland Cc: Sumit Garg Cc: Alexandru Elisei Link: https://lore.kernel.org/r/20201221162249.3119-1-lecopzer.chen@mediatek.com Link: https://lore.kernel.org/r/20210112221855.10666-1-will@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 2 -- arch/arm64/kernel/perf_event.c | 41 ++--------------------------------------- drivers/perf/arm_pmu.c | 5 ----- include/linux/perf/arm_pmu.h | 2 -- 4 files changed, 2 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 05e17351e4f3..f39568b28ec1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -174,8 +174,6 @@ config ARM64 select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS - select HAVE_PERF_EVENTS_NMI if ARM64_PSEUDO_NMI && HW_PERF_EVENTS - select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 38bb07eff872..3605f77ad4df 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -23,8 +23,6 @@ #include #include #include -#include -#include /* ARMv8 Cortex-A53 specific event types. */ #define ARMV8_A53_PERFCTR_PREF_LINEFILL 0xC2 @@ -1250,21 +1248,10 @@ static struct platform_driver armv8_pmu_driver = { static int __init armv8_pmu_driver_init(void) { - int ret; - if (acpi_disabled) - ret = platform_driver_register(&armv8_pmu_driver); + return platform_driver_register(&armv8_pmu_driver); else - ret = arm_pmu_acpi_probe(armv8_pmuv3_init); - - /* - * Try to re-initialize lockup detector after PMU init in - * case PMU events are triggered via NMIs. - */ - if (ret == 0 && arm_pmu_irq_is_nmi()) - lockup_detector_init(); - - return ret; + return arm_pmu_acpi_probe(armv8_pmuv3_init); } device_initcall(armv8_pmu_driver_init) @@ -1322,27 +1309,3 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time_zero = 1; userpg->cap_user_time_short = 1; } - -#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF -/* - * Safe maximum CPU frequency in case a particular platform doesn't implement - * cpufreq driver. Although, architecture doesn't put any restrictions on - * maximum frequency but 5 GHz seems to be safe maximum given the available - * Arm CPUs in the market which are clocked much less than 5 GHz. On the other - * hand, we can't make it much higher as it would lead to a large hard-lockup - * detection timeout on parts which are running slower (eg. 1GHz on - * Developerbox) and doesn't possess a cpufreq driver. - */ -#define SAFE_MAX_CPU_FREQ 5000000000UL // 5 GHz -u64 hw_nmi_get_sample_period(int watchdog_thresh) -{ - unsigned int cpu = smp_processor_id(); - unsigned long max_cpu_freq; - - max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL; - if (!max_cpu_freq) - max_cpu_freq = SAFE_MAX_CPU_FREQ; - - return (u64)max_cpu_freq * watchdog_thresh; -} -#endif diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 794a37d50853..cb2f55f450e4 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -726,11 +726,6 @@ static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu) return per_cpu(hw_events->irq, cpu); } -bool arm_pmu_irq_is_nmi(void) -{ - return has_nmi; -} - /* * PMU hardware loses all context when a CPU goes offline. 
* When a CPU is hotplugged back in, since some hardware registers are diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index bf7966776c55..505480217cf1 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -163,8 +163,6 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn); static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; } #endif -bool arm_pmu_irq_is_nmi(void); - /* Internal functions only for core arm_pmu code */ struct arm_pmu *armpmu_alloc(void); struct arm_pmu *armpmu_alloc_atomic(void); -- cgit v1.2.3 From dca5244d2f5b94f1809f0c02a549edf41ccd5493 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 12 Jan 2021 22:48:32 +0000 Subject: compiler.h: Raise minimum version of GCC to 5.1 for arm64 GCC versions >= 4.9 and < 5.1 have been shown to emit memory references beyond the stack pointer, resulting in memory corruption if an interrupt is taken after the stack pointer has been adjusted but before the reference has been executed. This leads to subtle, infrequent data corruption such as the EXT4 problems reported by Russell King at the link below. Life is too short for buggy compilers, so raise the minimum GCC version required by arm64 to 5.1. Reported-by: Russell King Suggested-by: Arnd Bergmann Signed-off-by: Will Deacon Tested-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Reviewed-by: Nathan Chancellor Acked-by: Linus Torvalds Cc: Cc: Theodore Ts'o Cc: Florian Weimer Cc: Peter Zijlstra Cc: Nick Desaulniers Link: https://lore.kernel.org/r/20210105154726.GD1551@shell.armlinux.org.uk Link: https://lore.kernel.org/r/20210112224832.10980-1-will@kernel.org Signed-off-by: Catalin Marinas --- include/linux/compiler-gcc.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 74c6c0486eed..555ab0fddbef 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -13,6 +13,12 @@ /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */ #if GCC_VERSION < 40900 # error Sorry, your version of GCC is too old - please use 4.9 or newer. +#elif defined(CONFIG_ARM64) && GCC_VERSION < 50100 +/* + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63293 + * https://lore.kernel.org/r/20210107111841.GN1551@shell.armlinux.org.uk + */ +# error Sorry, your version of GCC is too old - please use 5.1 or newer. #endif /* -- cgit v1.2.3 From 20d3bb92e84d417b0494a3b6867f0c86713db257 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Fri, 15 Jan 2021 07:30:46 +0100 Subject: nvme-pci: allow use of cmb on v1.4 controllers Since NVMe v1.4 the Controller Memory Buffer must be explicitly enabled by the host. 
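The enable sequence is a two-step write to the new CMBMSC register: first CRE so the CMBSZ/CMBLOC capability registers become readable, then the host-side base address together with CMSE to turn on controller memory space decoding (a condensed sketch of the flow in the patch below):

	if (NVME_CAP_CMBS(dev->ctrl.cap))
		writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC);

	/* ... read CMBSZ/CMBLOC and validate the BAR ... */

	if (NVME_CAP_CMBS(dev->ctrl.cap))
		hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE |
			     (pci_bus_address(pdev, bar) + offset),
			     dev->bar + NVME_REG_CMBMSC);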
Signed-off-by: Klaus Jensen [hch: avoid a local variable and add a comment] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 14 ++++++++++++++ include/linux/nvme.h | 6 ++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 50d9a20568a2..25456d02eddb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -1795,6 +1796,9 @@ static void nvme_map_cmb(struct nvme_dev *dev) if (dev->cmb_size) return; + if (NVME_CAP_CMBS(dev->ctrl.cap)) + writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC); + dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); if (!dev->cmbsz) return; @@ -1808,6 +1812,16 @@ static void nvme_map_cmb(struct nvme_dev *dev) if (offset > bar_size) return; + /* + * Tell the controller about the host side address mapping the CMB, + * and enable CMB decoding for the NVMe 1.4+ scheme: + */ + if (NVME_CAP_CMBS(dev->ctrl.cap)) { + hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE | + (pci_bus_address(pdev, bar) + offset), + dev->bar + NVME_REG_CMBMSC); + } + /* * Controllers may support a CMB size larger than their BAR, * for example, due to being behind a bridge. Reduce the CMB to diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d92535997687..bfed36e342cc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -116,6 +116,9 @@ enum { NVME_REG_BPMBL = 0x0048, /* Boot Partition Memory Buffer * Location */ + NVME_REG_CMBMSC = 0x0050, /* Controller Memory Buffer Memory + * Space Control + */ NVME_REG_PMRCAP = 0x0e00, /* Persistent Memory Capabilities */ NVME_REG_PMRCTL = 0x0e04, /* Persistent Memory Region Control */ NVME_REG_PMRSTS = 0x0e08, /* Persistent Memory Region Status */ @@ -135,6 +138,7 @@ enum { #define NVME_CAP_CSS(cap) (((cap) >> 37) & 0xff) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) #define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) +#define NVME_CAP_CMBS(cap) (((cap) >> 57) & 0x1) #define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) #define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) @@ -192,6 +196,8 @@ enum { NVME_CSTS_SHST_OCCUR = 1 << 2, NVME_CSTS_SHST_CMPLT = 2 << 2, NVME_CSTS_SHST_MASK = 3 << 2, + NVME_CMBMSC_CRE = 1 << 0, + NVME_CMBMSC_CMSE = 1 << 1, }; struct nvme_id_power_state { -- cgit v1.2.3 From 8eed01b5ca9c1deff329ad44f08e2041ca14842c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 18 Jan 2021 16:06:55 +0100 Subject: mdio-bitbang: Export mdiobb_{read,write}() Export mdiobb_read() and mdiobb_write(), so Ethernet controller drivers can call them from their MDIO read/write wrappers. 
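A typical consumer embeds the mdiobb_ctrl in its private data and wraps the exported accessor, for example to bracket the bitbanged transfer with device-specific setup; a hypothetical sketch (the mydrv_* names are illustrative only):

	static int mydrv_mdio_read(struct mii_bus *bus, int phy, int reg)
	{
		struct mdiobb_ctrl *ctrl = bus->priv;
		struct mydrv_priv *priv = container_of(ctrl, struct mydrv_priv, mdiobb);
		int ret;

		mydrv_mdio_power_on(priv);	/* hypothetical device-specific prologue */
		ret = mdiobb_read(bus, phy, reg);
		mydrv_mdio_power_off(priv);	/* hypothetical epilogue */
		return ret;
	}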
Signed-off-by: Geert Uytterhoeven Tested-by: Wolfram Sang Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/mdio/mdio-bitbang.c | 6 ++++-- include/linux/mdio-bitbang.h | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/mdio/mdio-bitbang.c b/drivers/net/mdio/mdio-bitbang.c index 5136275c8e73..d3915f831854 100644 --- a/drivers/net/mdio/mdio-bitbang.c +++ b/drivers/net/mdio/mdio-bitbang.c @@ -149,7 +149,7 @@ static int mdiobb_cmd_addr(struct mdiobb_ctrl *ctrl, int phy, u32 addr) return dev_addr; } -static int mdiobb_read(struct mii_bus *bus, int phy, int reg) +int mdiobb_read(struct mii_bus *bus, int phy, int reg) { struct mdiobb_ctrl *ctrl = bus->priv; int ret, i; @@ -180,8 +180,9 @@ static int mdiobb_read(struct mii_bus *bus, int phy, int reg) mdiobb_get_bit(ctrl); return ret; } +EXPORT_SYMBOL(mdiobb_read); -static int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val) +int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val) { struct mdiobb_ctrl *ctrl = bus->priv; @@ -201,6 +202,7 @@ static int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val) mdiobb_get_bit(ctrl); return 0; } +EXPORT_SYMBOL(mdiobb_write); struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl) { diff --git a/include/linux/mdio-bitbang.h b/include/linux/mdio-bitbang.h index 5d71e8a8500f..aca4dc037b70 100644 --- a/include/linux/mdio-bitbang.h +++ b/include/linux/mdio-bitbang.h @@ -35,6 +35,9 @@ struct mdiobb_ctrl { const struct mdiobb_ops *ops; }; +int mdiobb_read(struct mii_bus *bus, int phy, int reg); +int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val); + /* The returned bus is not yet registered with the phy layer. */ struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl); -- cgit v1.2.3 From de658a195ee23ca6aaffe197d1d2ea040beea0a2 Mon Sep 17 00:00:00 2001 From: Grant Grundler Date: Tue, 19 Jan 2021 17:12:08 -0800 Subject: net: usb: cdc_ncm: don't spew notifications RTL8156 sends notifications about every 32ms. Only display/log notifications when something changes. This issue has been reported by others: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1832472 https://lkml.org/lkml/2020/8/27/1083 ... [785962.779840] usb 1-1: new high-speed USB device number 5 using xhci_hcd [785962.929944] usb 1-1: New USB device found, idVendor=0bda, idProduct=8156, bcdDevice=30.00 [785962.929949] usb 1-1: New USB device strings: Mfr=1, Product=2, SerialNumber=6 [785962.929952] usb 1-1: Product: USB 10/100/1G/2.5G LAN [785962.929954] usb 1-1: Manufacturer: Realtek [785962.929956] usb 1-1: SerialNumber: 000000001 [785962.991755] usbcore: registered new interface driver cdc_ether [785963.017068] cdc_ncm 1-1:2.0: MAC-Address: 00:24:27:88:08:15 [785963.017072] cdc_ncm 1-1:2.0: setting rx_max = 16384 [785963.017169] cdc_ncm 1-1:2.0: setting tx_max = 16384 [785963.017682] cdc_ncm 1-1:2.0 usb0: register 'cdc_ncm' at usb-0000:00:14.0-1, CDC NCM, 00:24:27:88:08:15 [785963.019211] usbcore: registered new interface driver cdc_ncm [785963.023856] usbcore: registered new interface driver cdc_wdm [785963.025461] usbcore: registered new interface driver cdc_mbim [785963.038824] cdc_ncm 1-1:2.0 enx002427880815: renamed from usb0 [785963.089586] cdc_ncm 1-1:2.0 enx002427880815: network connection: disconnected [785963.121673] cdc_ncm 1-1:2.0 enx002427880815: network connection: disconnected [785963.153682] cdc_ncm 1-1:2.0 enx002427880815: network connection: disconnected ... 
This is about 2KB per second and will overwrite all contents of a 1MB dmesg buffer in under 10 minutes, rendering them useless for debugging many kernel problems. This is also an extra 180 MB/day in /var/logs (or 1GB per week), rendering the majority of those logs useless too. When the link is up (expected state), the spew amount is >2x higher: ... [786139.600992] cdc_ncm 2-1:2.0 enx002427880815: network connection: connected [786139.632997] cdc_ncm 2-1:2.0 enx002427880815: 2500 mbit/s downlink 2500 mbit/s uplink [786139.665097] cdc_ncm 2-1:2.0 enx002427880815: network connection: connected [786139.697100] cdc_ncm 2-1:2.0 enx002427880815: 2500 mbit/s downlink 2500 mbit/s uplink [786139.729094] cdc_ncm 2-1:2.0 enx002427880815: network connection: connected [786139.761108] cdc_ncm 2-1:2.0 enx002427880815: 2500 mbit/s downlink 2500 mbit/s uplink ... Chrome OS cannot support RTL8156 until this is fixed. Signed-off-by: Grant Grundler Reviewed-by: Hayes Wang Link: https://lore.kernel.org/r/20210120011208.3768105-1-grundler@chromium.org Signed-off-by: Jakub Kicinski --- drivers/net/usb/cdc_ncm.c | 12 +++++++++++- include/linux/usb/usbnet.h | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 5a78848db93f..291e76d32abe 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1827,6 +1827,15 @@ cdc_ncm_speed_change(struct usbnet *dev, uint32_t rx_speed = le32_to_cpu(data->DLBitRRate); uint32_t tx_speed = le32_to_cpu(data->ULBitRate); + /* if the speed hasn't changed, don't report it. + * RTL8156 shipped before 2021 sends notification about every 32ms. + */ + if (dev->rx_speed == rx_speed && dev->tx_speed == tx_speed) + return; + + dev->rx_speed = rx_speed; + dev->tx_speed = tx_speed; + /* * Currently the USB-NET API does not support reporting the actual * device speed. Do print it instead. @@ -1867,7 +1876,8 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) * USB_CDC_NOTIFY_NETWORK_CONNECTION notification shall be * sent by device after USB_CDC_NOTIFY_SPEED_CHANGE. */ - usbnet_link_change(dev, !!event->wValue, 0); + if (netif_carrier_ok(dev->net) != !!event->wValue) + usbnet_link_change(dev, !!event->wValue, 0); break; case USB_CDC_NOTIFY_SPEED_CHANGE: diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 88a7673894d5..cfbfd6fe01df 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -81,6 +81,8 @@ struct usbnet { # define EVENT_LINK_CHANGE 11 # define EVENT_SET_RX_MODE 12 # define EVENT_NO_IP_ALIGN 13 + u32 rx_speed; /* in bps - NOT Mbps */ + u32 tx_speed; /* in bps - NOT Mbps */ }; static inline struct usb_driver *driver_of(struct usb_interface *intf) -- cgit v1.2.3 From 51dfb6ca3728bd0a0a3c23776a12d2a15a1d2457 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 20 Jan 2021 23:58:44 +0300 Subject: regulator: consumer: Add missing stubs to regulator/consumer.h Add missing stubs to regulator/consumer.h in order to fix COMPILE_TEST of the kernel. In particular, this should fix compile-testing of the OPP core, which currently fails because of a missing stub for regulator_sync_voltage(). 
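The stubs follow the usual pattern for keeping consumers buildable when the subsystem is configured out: each call collapses to an inline that reports failure, so code such as the OPP core links and compile-tests cleanly (a sketch of the shape, mirroring the diff below):

	#if defined(CONFIG_REGULATOR)
	int regulator_sync_voltage(struct regulator *regulator);
	#else
	static inline int regulator_sync_voltage(struct regulator *regulator)
	{
		return -EINVAL;	/* consumers still compile; calls report failure */
	}
	#endif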
Reported-by: kernel test robot Signed-off-by: Dmitry Osipenko Link: https://lore.kernel.org/r/20210120205844.12658-1-digetx@gmail.com Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 2024944fd2f7..20e84a84fb77 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -331,6 +331,12 @@ regulator_get_exclusive(struct device *dev, const char *id) return ERR_PTR(-ENODEV); } +static inline struct regulator *__must_check +devm_regulator_get_exclusive(struct device *dev, const char *id) +{ + return ERR_PTR(-ENODEV); +} + static inline struct regulator *__must_check regulator_get_optional(struct device *dev, const char *id) { @@ -486,6 +492,11 @@ static inline int regulator_get_voltage(struct regulator *regulator) return -EINVAL; } +static inline int regulator_sync_voltage(struct regulator *regulator) +{ + return -EINVAL; +} + static inline int regulator_is_supported_voltage(struct regulator *regulator, int min_uV, int max_uV) { @@ -578,6 +589,25 @@ static inline int devm_regulator_unregister_notifier(struct regulator *regulator return 0; } +static inline int regulator_suspend_enable(struct regulator_dev *rdev, + suspend_state_t state) +{ + return -EINVAL; +} + +static inline int regulator_suspend_disable(struct regulator_dev *rdev, + suspend_state_t state) +{ + return -EINVAL; +} + +static inline int regulator_set_suspend_voltage(struct regulator *regulator, + int min_uV, int max_uV, + suspend_state_t state) +{ + return -EINVAL; +} + static inline void *regulator_get_drvdata(struct regulator *regulator) { return NULL; -- cgit v1.2.3 From e020ff611ba9be54e959e6b548038f8a020da1c9 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Sun, 10 Jan 2021 09:54:07 -0800 Subject: driver core: Fix device link device name collision The device link device's name was of the form: <supplier device name>--<consumer device name> This can cause name collisions, as reported in [1], because device names are not globally unique. Since device names have to be unique within the bus/class, add the bus/class name as a prefix to the device names used to construct the device link device name. So the device link device's name will be of the form: <supplier bus name>:<supplier device name>--<consumer bus name>:<consumer device name> [1] - https://lore.kernel.org/lkml/20201229033440.32142-1-michael@walle.cc/ Fixes: 287905e68dd2 ("driver core: Expose device link details in sysfs") Cc: stable@vger.kernel.org Reported-by: Michael Walle Tested-by: Michael Walle Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20210110175408.1465657-1-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-class-devlink | 4 ++-- Documentation/ABI/testing/sysfs-devices-consumer | 5 +++-- Documentation/ABI/testing/sysfs-devices-supplier | 5 +++-- drivers/base/core.c | 27 +++++++++++++----------- include/linux/device.h | 12 +++++++++++ 5 files changed, 35 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-devlink b/Documentation/ABI/testing/sysfs-class-devlink index b662f747c83e..8a21ce515f61 100644 --- a/Documentation/ABI/testing/sysfs-class-devlink +++ b/Documentation/ABI/testing/sysfs-class-devlink @@ -5,8 +5,8 @@ Description: Provide a place in sysfs for the device link objects in the kernel at any given time. The name of a device link directory, denoted as ... 
above, is of the form <supplier>--<consumer> - where <supplier> is the supplier device name and <consumer> is - the consumer device name. + where <supplier> is the supplier bus:device name and + <consumer> is the consumer bus:device name. What: /sys/class/devlink/.../auto_remove_on Date: May 2020 diff --git a/Documentation/ABI/testing/sysfs-devices-consumer b/Documentation/ABI/testing/sysfs-devices-consumer index 1f06d74d1c3c..0809fda092e6 100644 --- a/Documentation/ABI/testing/sysfs-devices-consumer +++ b/Documentation/ABI/testing/sysfs-devices-consumer @@ -4,5 +4,6 @@ Contact: Saravana Kannan Description: The /sys/devices/.../consumer:<consumer> are symlinks to device links where this device is the supplier. <consumer> denotes the - name of the consumer in that device link. There can be zero or - more of these symlinks for a given device. + name of the consumer in that device link and is of the form + bus:device name. There can be zero or more of these symlinks + for a given device. diff --git a/Documentation/ABI/testing/sysfs-devices-supplier b/Documentation/ABI/testing/sysfs-devices-supplier index a919e0db5e90..207f5972e98d 100644 --- a/Documentation/ABI/testing/sysfs-devices-supplier +++ b/Documentation/ABI/testing/sysfs-devices-supplier @@ -4,5 +4,6 @@ Contact: Saravana Kannan Description: The /sys/devices/.../supplier:<supplier> are symlinks to device links where this device is the consumer. <supplier> denotes the - name of the supplier in that device link. There can be zero or - more of these symlinks for a given device. + name of the supplier in that device link and is of the form + bus:device name. There can be zero or more of these symlinks + for a given device. diff --git a/drivers/base/core.c b/drivers/base/core.c index 3819fd012e27..78f6db169d47 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -471,7 +471,9 @@ static int devlink_add_symlinks(struct device *dev, struct device *con = link->consumer; char *buf; - len = max(strlen(dev_name(sup)), strlen(dev_name(con))); + len = max(strlen(dev_bus_name(sup)) + strlen(dev_name(sup)), + strlen(dev_bus_name(con)) + strlen(dev_name(con))); + len += strlen(":"); len += strlen("supplier:") + 1; buf = kzalloc(len, GFP_KERNEL); if (!buf) @@ -485,12 +487,12 @@ static int devlink_add_symlinks(struct device *dev, if (ret) goto err_con; - snprintf(buf, len, "consumer:%s", dev_name(con)); + snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con)); ret = sysfs_create_link(&sup->kobj, &link->link_dev.kobj, buf); if (ret) goto err_con_dev; - snprintf(buf, len, "supplier:%s", dev_name(sup)); + snprintf(buf, len, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup)); ret = sysfs_create_link(&con->kobj, &link->link_dev.kobj, buf); if (ret) goto err_sup_dev; @@ -498,7 +500,7 @@ static int devlink_add_symlinks(struct device *dev, goto out; err_sup_dev: - snprintf(buf, len, "consumer:%s", dev_name(con)); + snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con)); sysfs_remove_link(&sup->kobj, buf); err_con_dev: sysfs_remove_link(&link->link_dev.kobj, "consumer"); @@ -521,7 +523,9 @@ static void devlink_remove_symlinks(struct device *dev, sysfs_remove_link(&link->link_dev.kobj, "consumer"); sysfs_remove_link(&link->link_dev.kobj, "supplier"); - len = max(strlen(dev_name(sup)), strlen(dev_name(con))); + len = max(strlen(dev_bus_name(sup)) + strlen(dev_name(sup)), + strlen(dev_bus_name(con)) + strlen(dev_name(con))); + len += strlen(":"); len += strlen("supplier:") + 1; buf = kzalloc(len, GFP_KERNEL); if (!buf) { @@ -529,9 +533,9 @@ static void devlink_remove_symlinks(struct device *dev, return; } - snprintf(buf, len, "supplier:%s", dev_name(sup)); + snprintf(buf, len, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup)); sysfs_remove_link(&con->kobj, buf); - snprintf(buf, len, "consumer:%s", dev_name(con)); + snprintf(buf, len, "consumer:%s:%s", dev_bus_name(con), dev_name(con)); sysfs_remove_link(&sup->kobj, buf); kfree(buf); } @@ -752,8 +756,9 @@ struct device_link *device_link_add(struct device *consumer, link->link_dev.class = &devlink_class; device_set_pm_not_required(&link->link_dev); - dev_set_name(&link->link_dev, "%s--%s", - dev_name(supplier), dev_name(consumer)); + dev_set_name(&link->link_dev, "%s:%s--%s:%s", + dev_bus_name(supplier), dev_name(supplier), + dev_bus_name(consumer), dev_name(consumer)); if (device_register(&link->link_dev)) { put_device(consumer); put_device(supplier); @@ -1823,9 +1828,7 @@ const char *dev_driver_string(const struct device *dev) * never change once they are set, so they don't need special care. */ drv = READ_ONCE(dev->driver); - return drv ? drv->name : - (dev->bus ? dev->bus->name : - (dev->class ? dev->class->name : "")); + return drv ? drv->name : dev_bus_name(dev); } EXPORT_SYMBOL(dev_driver_string); diff --git a/include/linux/device.h b/include/linux/device.h index 89bb8b84173e..1779f90eeb4c 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -609,6 +609,18 @@ static inline const char *dev_name(const struct device *dev) return kobject_name(&dev->kobj); } +/** + * dev_bus_name - Return a device's bus/class name, if at all possible + * @dev: struct device to get the bus/class name of + * + * Will return the name of the bus/class the device is attached to. If it is + * not attached to a bus/class, an empty string will be returned. + */ +static inline const char *dev_bus_name(const struct device *dev) +{ + return dev->bus ? dev->bus->name : (dev->class ? dev->class->name : ""); +} + __printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...); #ifdef CONFIG_NUMA -- cgit v1.2.3 From ac687e6e8c26181a33270efd1a2e2241377924b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jan 2021 11:24:04 +0100 Subject: kthread: Extract KTHREAD_IS_PER_CPU There is a need to distinguish genuine per-cpu kthreads from kthreads that happen to have a single CPU affinity. Genuine per-cpu kthreads are kthreads that are CPU affine for correctness; these will obviously have PF_KTHREAD set, but must also have PF_NO_SETAFFINITY set, lest userspace modify their affinity and ruin things. However, these two things are not sufficient: PF_NO_SETAFFINITY is also set on other tasks that have their affinities controlled through other means, like for instance workqueues. Therefore another bit is needed; it turns out kthread_create_on_cpu() already has such a bit: KTHREAD_IS_PER_CPU, which is used to make kthread_park()/kthread_unpark() work correctly. Expose this flag and remove the implicit setting of it from kthread_create_on_cpu(); the io_uring usage of it seems dubious at best. 
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 65b81e0c494d..2484ed97e72f 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -33,6 +33,9 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 					  unsigned int cpu,
 					  const char *namefmt);
 
+void kthread_set_per_cpu(struct task_struct *k, int cpu);
+bool kthread_is_per_cpu(struct task_struct *k);
+
 /**
  * kthread_run - create and wake a thread.
  * @threadfn: the function to run until signal_pending(current).
diff --git a/kernel/kthread.c b/kernel/kthread.c
index a5eceecd4513..e0e4a423f184 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -493,11 +493,36 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 		return p;
 	kthread_bind(p, cpu);
 	/* CPU hotplug need to bind once again when unparking the thread. */
-	set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
 	to_kthread(p)->cpu = cpu;
 	return p;
 }
 
+void kthread_set_per_cpu(struct task_struct *k, int cpu)
+{
+	struct kthread *kthread = to_kthread(k);
+	if (!kthread)
+		return;
+
+	WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
+
+	if (cpu < 0) {
+		clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+		return;
+	}
+
+	kthread->cpu = cpu;
+	set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+}
+
+bool kthread_is_per_cpu(struct task_struct *k)
+{
+	struct kthread *kthread = to_kthread(k);
+	if (!kthread)
+		return false;
+
+	return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+}
+
 /**
  * kthread_unpark - unpark a thread created by kthread_create().
  * @k: thread created by kthread_create().
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 2efe1e206167..f25208e8df83 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -188,6 +188,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
 		kfree(td);
 		return PTR_ERR(tsk);
 	}
+	kthread_set_per_cpu(tsk, cpu);
 	/*
 	 * Park the thread so that it could start right on the CPU
 	 * when it is available.
-- 
cgit v1.2.3
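The query side is what matters for CPU hotplug. A hedged sketch of a
hypothetical consumer (the real users of kthread_is_per_cpu() arrive in
later scheduler changes and are not shown here):

	#include <linux/kthread.h>
	#include <linux/sched.h>

	/* Genuine per-cpu kthreads must never be migrated off their CPU;
	 * tasks that are merely affine (PF_NO_SETAFFINITY set by
	 * workqueues and the like) may be moved when their CPU goes down. */
	static bool may_migrate(struct task_struct *p)
	{
		return !((p->flags & PF_KTHREAD) && kthread_is_per_cpu(p));
	}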
From 9f12e37cae44a96132fc3031535a0b165486941a Mon Sep 17 00:00:00 2001
From: Sami Tolvanen
Date: Mon, 25 Jan 2021 11:09:25 -0800
Subject: Commit 9bb48c82aced ("tty: implement write_iter") converted the tty
 layer to use write_iter. Fix the redirected_tty_write declaration also in
 n_tty and change the comparisons to use write_iter instead of write.

[ Also moved the declaration of redirected_tty_write() to the proper
  location in a header file.  The reason for the bug was the bogus extern
  declaration in n_tty.c silently not matching the changed definition in
  tty_io.c, and because it wasn't in a shared header file, there was no
  cross-checking of the declaration.

  Sami noticed because Clang's Control Flow Integrity checking ended up
  incidentally noticing the inconsistent declaration.

    - Linus ]

Fixes: 9bb48c82aced ("tty: implement write_iter")
Signed-off-by: Sami Tolvanen
Signed-off-by: Linus Torvalds
---
 drivers/tty/n_tty.c  | 7 ++-----
 drivers/tty/tty_io.c | 2 --
 include/linux/tty.h  | 1 +
 3 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 319d68c8a5df..219e85756171 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -2081,9 +2081,6 @@ static int canon_copy_from_read_buf(struct tty_struct *tty,
 	return 0;
 }
 
-extern ssize_t redirected_tty_write(struct file *, const char __user *,
-						size_t, loff_t *);
-
 /**
  *	job_control	-	check job control
  *	@tty: tty
@@ -2105,7 +2102,7 @@ static int job_control(struct tty_struct *tty, struct file *file)
 	/* NOTE: not yet done after every sleep pending a thorough
 	   check of the logic of this change. -- jlc */
 	/* don't stop on /dev/console */
-	if (file->f_op->write == redirected_tty_write)
+	if (file->f_op->write_iter == redirected_tty_write)
 		return 0;
 
 	return __tty_check_change(tty, SIGTTIN);
@@ -2309,7 +2306,7 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
 	ssize_t retval = 0;
 
 	/* Job control check -- must be done at start (POSIX.1 7.1.1.4). */
-	if (L_TOSTOP(tty) && file->f_op->write != redirected_tty_write) {
+	if (L_TOSTOP(tty) && file->f_op->write_iter != redirected_tty_write) {
 		retval = tty_check_change(tty);
 		if (retval)
 			return retval;
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 4a208a95e921..48de20916ca7 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -144,10 +144,8 @@ DEFINE_MUTEX(tty_mutex);
 
 static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *);
 static ssize_t tty_write(struct kiocb *, struct iov_iter *);
-ssize_t redirected_tty_write(struct kiocb *, struct iov_iter *);
 static __poll_t tty_poll(struct file *, poll_table *);
 static int tty_open(struct inode *, struct file *);
-long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 #ifdef CONFIG_COMPAT
 static long tty_compat_ioctl(struct file *file, unsigned int cmd,
 			     unsigned long arg);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index c873f475f0a7..37803f3e6d49 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -421,6 +421,7 @@ extern void tty_kclose(struct tty_struct *tty);
 extern int tty_dev_name_to_number(const char *name, dev_t *number);
 extern int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout);
 extern void tty_ldisc_unlock(struct tty_struct *tty);
+extern ssize_t redirected_tty_write(struct kiocb *, struct iov_iter *);
 #else
 static inline void tty_kref_put(struct tty_struct *tty)
 { }
-- 
cgit v1.2.3
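The failure mode described in Linus' bracketed note is general, and worth
spelling out. A minimal two-file illustration with invented names: each
translation unit compiles cleanly on its own, because nothing forces the
stale local declaration to agree with the real definition; moving the
prototype into a shared header, as the include/linux/tty.h hunk above does
for redirected_tty_write(), turns any future mismatch into a compile error:

	/* lib.c -- the definition changed at some point */
	long op(int a, long b)
	{
		return a + b;
	}

	/* user.c -- stale local extern; with no shared header the compiler
	 * has nothing to check it against, so this still builds */
	extern long op(long a);

	long use(void)
	{
		return op(42);	/* mismatched call: undefined behaviour */
	}

	/* lib.h -- the fix: one prototype included by both files, so a
	 * signature change in lib.c breaks the build instead of runtime */
	long op(int a, long b);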