From 4365a5676fa3aa1d5ae6c90c22a0044f09ba584e Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hioryu@jp.fujitsu.com>
Date: Tue, 15 Dec 2009 16:45:33 -0800
Subject: oom-kill: fix NUMA constraint check with nodemask

Fix node-oriented allocation handling in oom-kill.c I myself think of this
as a bugfix not as an ehnancement.

In these days, things are changed as
  - alloc_pages() eats nodemask as its arguments, __alloc_pages_nodemask().
  - mempolicy don't maintain its own private zonelists.
  (And cpuset doesn't use nodemask for __alloc_pages_nodemask())

So, current oom-killer's check function is wrong.

This patch does
  - check nodemask, if nodemask && nodemask doesn't cover all
    node_states[N_HIGH_MEMORY], this is CONSTRAINT_MEMORY_POLICY.
  - Scan all zonelist under nodemask, if it hits cpuset's wall
    this faiulre is from cpuset.
And
  - modifies the caller of out_of_memory not to call oom if __GFP_THISNODE.
    This doesn't change "current" behavior. If callers use __GFP_THISNODE
    it should handle "page allocation failure" by itself.

  - handle __GFP_NOFAIL+__GFP_THISNODE path.
    This is something like a FIXME but this gfpmask is not used now.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hioryu@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6aac5fe4f6f1..537662315627 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -10,6 +10,7 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
+#include <linux/nodemask.h>
 
 struct zonelist;
 struct notifier_block;
@@ -26,7 +27,8 @@ enum oom_constraint {
 extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
-extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
+extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *mask);
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
-- 
cgit v1.2.3


From 35570ac6039ef490b9c5abde1fee4803a39bf4e1 Mon Sep 17 00:00:00 2001
From: Richard Röjfors <richard.rojfors@mocean-labs.com>
Date: Tue, 15 Dec 2009 16:46:18 -0800
Subject: gpio: add GPIO driver for the Timberdale FPGA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A GPIO driver for the Timberdale FPGA found on the Intel Atom board
Russellville.

The GPIO driver also has an IRQ-chip to support interrupts on the pins.

Signed-off-by: Richard Röjfors <richard.rojfors@mocean-labs.com>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpio/Kconfig      |   6 +
 drivers/gpio/Makefile     |   1 +
 drivers/gpio/timbgpio.c   | 342 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/timb_gpio.h |  37 +++++
 4 files changed, 386 insertions(+)
 create mode 100644 drivers/gpio/timbgpio.c
 create mode 100644 include/linux/timb_gpio.h

(limited to 'include/linux')

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 57ca339924ef..a019b49ecc9b 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -206,6 +206,12 @@ config GPIO_LANGWELL
 	help
 	  Say Y here to support Intel Moorestown platform GPIO.
 
+config GPIO_TIMBERDALE
+	bool "Support for timberdale GPIO IP"
+	depends on MFD_TIMBERDALE && GPIOLIB && HAS_IOMEM
+	---help---
+	Add support for the GPIO IP in the timberdale FPGA.
+
 comment "SPI GPIO expanders:"
 
 config GPIO_MAX7301
diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile
index 270b6d7839f5..52fe4cf734c7 100644
--- a/drivers/gpio/Makefile
+++ b/drivers/gpio/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_GPIO_MCP23S08)	+= mcp23s08.o
 obj-$(CONFIG_GPIO_PCA953X)	+= pca953x.o
 obj-$(CONFIG_GPIO_PCF857X)	+= pcf857x.o
 obj-$(CONFIG_GPIO_PL061)	+= pl061.o
+obj-$(CONFIG_GPIO_TIMBERDALE)	+= timbgpio.o
 obj-$(CONFIG_GPIO_TWL4030)	+= twl4030-gpio.o
 obj-$(CONFIG_GPIO_UCB1400)	+= ucb1400_gpio.o
 obj-$(CONFIG_GPIO_XILINX)	+= xilinx_gpio.o
diff --git a/drivers/gpio/timbgpio.c b/drivers/gpio/timbgpio.c
new file mode 100644
index 000000000000..a4d344ba8e5c
--- /dev/null
+++ b/drivers/gpio/timbgpio.c
@@ -0,0 +1,342 @@
+/*
+ * timbgpio.c timberdale FPGA GPIO driver
+ * Copyright (c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* Supports:
+ * Timberdale FPGA GPIO
+ */
+
+#include <linux/module.h>
+#include <linux/gpio.h>
+#include <linux/platform_device.h>
+#include <linux/io.h>
+#include <linux/timb_gpio.h>
+#include <linux/interrupt.h>
+
+#define DRIVER_NAME "timb-gpio"
+
+#define TGPIOVAL	0x00
+#define TGPIODIR	0x04
+#define TGPIO_IER	0x08
+#define TGPIO_ISR	0x0c
+#define TGPIO_IPR	0x10
+#define TGPIO_ICR	0x14
+#define TGPIO_FLR	0x18
+#define TGPIO_LVR	0x1c
+
+struct timbgpio {
+	void __iomem		*membase;
+	spinlock_t		lock; /* mutual exclusion */
+	struct gpio_chip	gpio;
+	int			irq_base;
+};
+
+static int timbgpio_update_bit(struct gpio_chip *gpio, unsigned index,
+	unsigned offset, bool enabled)
+{
+	struct timbgpio *tgpio = container_of(gpio, struct timbgpio, gpio);
+	u32 reg;
+
+	spin_lock(&tgpio->lock);
+	reg = ioread32(tgpio->membase + offset);
+
+	if (enabled)
+		reg |= (1 << index);
+	else
+		reg &= ~(1 << index);
+
+	iowrite32(reg, tgpio->membase + offset);
+	spin_unlock(&tgpio->lock);
+
+	return 0;
+}
+
+static int timbgpio_gpio_direction_input(struct gpio_chip *gpio, unsigned nr)
+{
+	return timbgpio_update_bit(gpio, nr, TGPIODIR, true);
+}
+
+static int timbgpio_gpio_get(struct gpio_chip *gpio, unsigned nr)
+{
+	struct timbgpio *tgpio = container_of(gpio, struct timbgpio, gpio);
+	u32 value;
+
+	value = ioread32(tgpio->membase + TGPIOVAL);
+	return (value & (1 << nr)) ? 1 : 0;
+}
+
+static int timbgpio_gpio_direction_output(struct gpio_chip *gpio,
+						unsigned nr, int val)
+{
+	return timbgpio_update_bit(gpio, nr, TGPIODIR, false);
+}
+
+static void timbgpio_gpio_set(struct gpio_chip *gpio,
+				unsigned nr, int val)
+{
+	timbgpio_update_bit(gpio, nr, TGPIOVAL, val != 0);
+}
+
+static int timbgpio_to_irq(struct gpio_chip *gpio, unsigned offset)
+{
+	struct timbgpio *tgpio = container_of(gpio, struct timbgpio, gpio);
+
+	if (tgpio->irq_base <= 0)
+		return -EINVAL;
+
+	return tgpio->irq_base + offset;
+}
+
+/*
+ * GPIO IRQ
+ */
+static void timbgpio_irq_disable(unsigned irq)
+{
+	struct timbgpio *tgpio = get_irq_chip_data(irq);
+	int offset = irq - tgpio->irq_base;
+
+	timbgpio_update_bit(&tgpio->gpio, offset, TGPIO_IER, 0);
+}
+
+static void timbgpio_irq_enable(unsigned irq)
+{
+	struct timbgpio *tgpio = get_irq_chip_data(irq);
+	int offset = irq - tgpio->irq_base;
+
+	timbgpio_update_bit(&tgpio->gpio, offset, TGPIO_IER, 1);
+}
+
+static int timbgpio_irq_type(unsigned irq, unsigned trigger)
+{
+	struct timbgpio *tgpio = get_irq_chip_data(irq);
+	int offset = irq - tgpio->irq_base;
+	unsigned long flags;
+	u32 lvr, flr;
+
+	if (offset < 0 || offset > tgpio->gpio.ngpio)
+		return -EINVAL;
+
+	spin_lock_irqsave(&tgpio->lock, flags);
+
+	lvr = ioread32(tgpio->membase + TGPIO_LVR);
+	flr = ioread32(tgpio->membase + TGPIO_FLR);
+
+	if (trigger & (IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW)) {
+		flr &= ~(1 << offset);
+		if (trigger & IRQ_TYPE_LEVEL_HIGH)
+			lvr |= 1 << offset;
+		else
+			lvr &= ~(1 << offset);
+	}
+
+	if ((trigger & IRQ_TYPE_EDGE_BOTH) == IRQ_TYPE_EDGE_BOTH)
+		return -EINVAL;
+	else {
+		flr |= 1 << offset;
+		/* opposite compared to the datasheet, but it mirrors the
+		 * reality
+		 */
+		if (trigger & IRQ_TYPE_EDGE_FALLING)
+			lvr |= 1 << offset;
+		else
+			lvr &= ~(1 << offset);
+	}
+
+	iowrite32(lvr, tgpio->membase + TGPIO_LVR);
+	iowrite32(flr, tgpio->membase + TGPIO_FLR);
+	iowrite32(1 << offset, tgpio->membase + TGPIO_ICR);
+	spin_unlock_irqrestore(&tgpio->lock, flags);
+
+	return 0;
+}
+
+static void timbgpio_irq(unsigned int irq, struct irq_desc *desc)
+{
+	struct timbgpio *tgpio = get_irq_data(irq);
+	unsigned long ipr;
+	int offset;
+
+	desc->chip->ack(irq);
+	ipr = ioread32(tgpio->membase + TGPIO_IPR);
+	iowrite32(ipr, tgpio->membase + TGPIO_ICR);
+
+	for_each_bit(offset, &ipr, tgpio->gpio.ngpio)
+		generic_handle_irq(timbgpio_to_irq(&tgpio->gpio, offset));
+}
+
+static struct irq_chip timbgpio_irqchip = {
+	.name		= "GPIO",
+	.enable		= timbgpio_irq_enable,
+	.disable	= timbgpio_irq_disable,
+	.set_type	= timbgpio_irq_type,
+};
+
+static int __devinit timbgpio_probe(struct platform_device *pdev)
+{
+	int err, i;
+	struct gpio_chip *gc;
+	struct timbgpio *tgpio;
+	struct resource *iomem;
+	struct timbgpio_platform_data *pdata = pdev->dev.platform_data;
+	int irq = platform_get_irq(pdev, 0);
+
+	if (!pdata || pdata->nr_pins > 32) {
+		err = -EINVAL;
+		goto err_mem;
+	}
+
+	iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!iomem) {
+		err = -EINVAL;
+		goto err_mem;
+	}
+
+	tgpio = kzalloc(sizeof(*tgpio), GFP_KERNEL);
+	if (!tgpio) {
+		err = -EINVAL;
+		goto err_mem;
+	}
+	tgpio->irq_base = pdata->irq_base;
+
+	spin_lock_init(&tgpio->lock);
+
+	if (!request_mem_region(iomem->start, resource_size(iomem),
+		DRIVER_NAME)) {
+		err = -EBUSY;
+		goto err_request;
+	}
+
+	tgpio->membase = ioremap(iomem->start, resource_size(iomem));
+	if (!tgpio->membase) {
+		err = -ENOMEM;
+		goto err_ioremap;
+	}
+
+	gc = &tgpio->gpio;
+
+	gc->label = dev_name(&pdev->dev);
+	gc->owner = THIS_MODULE;
+	gc->dev = &pdev->dev;
+	gc->direction_input = timbgpio_gpio_direction_input;
+	gc->get = timbgpio_gpio_get;
+	gc->direction_output = timbgpio_gpio_direction_output;
+	gc->set = timbgpio_gpio_set;
+	gc->to_irq = (irq >= 0 && tgpio->irq_base > 0) ? timbgpio_to_irq : NULL;
+	gc->dbg_show = NULL;
+	gc->base = pdata->gpio_base;
+	gc->ngpio = pdata->nr_pins;
+	gc->can_sleep = 0;
+
+	err = gpiochip_add(gc);
+	if (err)
+		goto err_chipadd;
+
+	platform_set_drvdata(pdev, tgpio);
+
+	/* make sure to disable interrupts */
+	iowrite32(0x0, tgpio->membase + TGPIO_IER);
+
+	if (irq < 0 || tgpio->irq_base <= 0)
+		return 0;
+
+	for (i = 0; i < pdata->nr_pins; i++) {
+		set_irq_chip_and_handler_name(tgpio->irq_base + i,
+			&timbgpio_irqchip, handle_simple_irq, "mux");
+		set_irq_chip_data(tgpio->irq_base + i, tgpio);
+#ifdef CONFIG_ARM
+		set_irq_flags(tgpio->irq_base + i, IRQF_VALID | IRQF_PROBE);
+#endif
+	}
+
+	set_irq_data(irq, tgpio);
+	set_irq_chained_handler(irq, timbgpio_irq);
+
+	return 0;
+
+err_chipadd:
+	iounmap(tgpio->membase);
+err_ioremap:
+	release_mem_region(iomem->start, resource_size(iomem));
+err_request:
+	kfree(tgpio);
+err_mem:
+	printk(KERN_ERR DRIVER_NAME": Failed to register GPIOs: %d\n", err);
+
+	return err;
+}
+
+static int __devexit timbgpio_remove(struct platform_device *pdev)
+{
+	int err;
+	struct timbgpio_platform_data *pdata = pdev->dev.platform_data;
+	struct timbgpio *tgpio = platform_get_drvdata(pdev);
+	struct resource *iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	int irq = platform_get_irq(pdev, 0);
+
+	if (irq >= 0 && tgpio->irq_base > 0) {
+		int i;
+		for (i = 0; i < pdata->nr_pins; i++) {
+			set_irq_chip(tgpio->irq_base + i, NULL);
+			set_irq_chip_data(tgpio->irq_base + i, NULL);
+		}
+
+		set_irq_handler(irq, NULL);
+		set_irq_data(irq, NULL);
+	}
+
+	err = gpiochip_remove(&tgpio->gpio);
+	if (err)
+		printk(KERN_ERR DRIVER_NAME": failed to remove gpio_chip\n");
+
+	iounmap(tgpio->membase);
+	release_mem_region(iomem->start, resource_size(iomem));
+	kfree(tgpio);
+
+	platform_set_drvdata(pdev, NULL);
+
+	return 0;
+}
+
+static struct platform_driver timbgpio_platform_driver = {
+	.driver = {
+		.name	= DRIVER_NAME,
+		.owner	= THIS_MODULE,
+	},
+	.probe		= timbgpio_probe,
+	.remove		= timbgpio_remove,
+};
+
+/*--------------------------------------------------------------------------*/
+
+static int __init timbgpio_init(void)
+{
+	return platform_driver_register(&timbgpio_platform_driver);
+}
+
+static void __exit timbgpio_exit(void)
+{
+	platform_driver_unregister(&timbgpio_platform_driver);
+}
+
+module_init(timbgpio_init);
+module_exit(timbgpio_exit);
+
+MODULE_DESCRIPTION("Timberdale GPIO driver");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Mocean Laboratories");
+MODULE_ALIAS("platform:"DRIVER_NAME);
+
diff --git a/include/linux/timb_gpio.h b/include/linux/timb_gpio.h
new file mode 100644
index 000000000000..ce456eaae861
--- /dev/null
+++ b/include/linux/timb_gpio.h
@@ -0,0 +1,37 @@
+/*
+ * timb_gpio.h timberdale FPGA GPIO driver, platform data definition
+ * Copyright (c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _LINUX_TIMB_GPIO_H
+#define _LINUX_TIMB_GPIO_H
+
+/**
+ * struct timbgpio_platform_data - Platform data of the Timberdale GPIO driver
+ * @gpio_base		The number of the first GPIO pin, set to -1 for
+ *			dynamic number allocation.
+ * @nr_pins		Number of pins that is supported by the hardware (1-32)
+ * @irq_base		If IRQ is supported by the hardware, this is the base
+ *			number of IRQ:s. One IRQ per pin will be used. Set to
+ *			-1 if IRQ:s is not supported.
+ */
+struct timbgpio_platform_data {
+	int gpio_base;
+	int nr_pins;
+	int irq_base;
+};
+
+#endif
-- 
cgit v1.2.3


From 0769746183caff9d4334be48c7b0e7d2ec8716c4 Mon Sep 17 00:00:00 2001
From: Jani Nikula <ext-jani.1.nikula@nokia.com>
Date: Tue, 15 Dec 2009 16:46:20 -0800
Subject: gpiolib: add support for changing value polarity in sysfs

Drivers may use gpiolib sysfs as part of their public user space
interface. The GPIO number and polarity might change from board to
board. The gpio_export_link() call can be used to hide the GPIO number
from user space. Add support for also hiding the GPIO line polarity
changes from user space.

Signed-off-by: Jani Nikula <ext-jani.1.nikula@nokia.com>
Cc: David Brownell <david-b@pacbell.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/gpio.txt     |  15 +++++
 drivers/gpio/gpiolib.c     | 161 +++++++++++++++++++++++++++++++++++++++++----
 include/asm-generic/gpio.h |   6 ++
 include/linux/gpio.h       |   6 ++
 4 files changed, 176 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/gpio.txt b/Documentation/gpio.txt
index e4e7daed2ba8..1866c27eec69 100644
--- a/Documentation/gpio.txt
+++ b/Documentation/gpio.txt
@@ -531,6 +531,13 @@ and have the following read/write attributes:
 		This file exists only if the pin can be configured as an
 		interrupt generating input pin.
 
+	"active_low" ... reads as either 0 (false) or 1 (true).  Write
+		any nonzero value to invert the value attribute both
+		for reading and writing.  Existing and subsequent
+		poll(2) support configuration via the edge attribute
+		for "rising" and "falling" edges will follow this
+		setting.
+
 GPIO controllers have paths like /sys/class/gpio/gpiochip42/ (for the
 controller implementing GPIOs starting at #42) and have the following
 read-only attributes:
@@ -566,6 +573,8 @@ requested using gpio_request():
 	int gpio_export_link(struct device *dev, const char *name,
 		unsigned gpio)
 
+	/* change the polarity of a GPIO node in sysfs */
+	int gpio_sysfs_set_active_low(unsigned gpio, int value);
 
 After a kernel driver requests a GPIO, it may only be made available in
 the sysfs interface by gpio_export().  The driver can control whether the
@@ -580,3 +589,9 @@ After the GPIO has been exported, gpio_export_link() allows creating
 symlinks from elsewhere in sysfs to the GPIO sysfs node.  Drivers can
 use this to provide the interface under their own device in sysfs with
 a descriptive name.
+
+Drivers can use gpio_sysfs_set_active_low() to hide GPIO line polarity
+differences between boards from user space.  This only affects the
+sysfs interface.  Polarity change can be done both before and after
+gpio_export(), and previously enabled poll(2) support for either
+rising or falling edge will be reconfigured to follow this setting.
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 50de0f5750d8..a25ad284a272 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -53,6 +53,7 @@ struct gpio_desc {
 #define FLAG_SYSFS	4	/* exported via /sys/class/gpio/control */
 #define FLAG_TRIG_FALL	5	/* trigger on falling edge */
 #define FLAG_TRIG_RISE	6	/* trigger on rising edge */
+#define FLAG_ACTIVE_LOW	7	/* sysfs value has active low */
 
 #define PDESC_ID_SHIFT	16	/* add new flags before this one */
 
@@ -210,6 +211,11 @@ static DEFINE_MUTEX(sysfs_lock);
  *      * configures behavior of poll(2) on /value
  *      * available only if pin can generate IRQs on input
  *      * is read/write as "none", "falling", "rising", or "both"
+ *   /active_low
+ *      * configures polarity of /value
+ *      * is read/write as zero/nonzero
+ *      * also affects existing and subsequent "falling" and "rising"
+ *        /edge configuration
  */
 
 static ssize_t gpio_direction_show(struct device *dev,
@@ -255,7 +261,7 @@ static ssize_t gpio_direction_store(struct device *dev,
 	return status ? : size;
 }
 
-static const DEVICE_ATTR(direction, 0644,
+static /* const */ DEVICE_ATTR(direction, 0644,
 		gpio_direction_show, gpio_direction_store);
 
 static ssize_t gpio_value_show(struct device *dev,
@@ -267,10 +273,17 @@ static ssize_t gpio_value_show(struct device *dev,
 
 	mutex_lock(&sysfs_lock);
 
-	if (!test_bit(FLAG_EXPORT, &desc->flags))
+	if (!test_bit(FLAG_EXPORT, &desc->flags)) {
 		status = -EIO;
-	else
-		status = sprintf(buf, "%d\n", !!gpio_get_value_cansleep(gpio));
+	} else {
+		int value;
+
+		value = !!gpio_get_value_cansleep(gpio);
+		if (test_bit(FLAG_ACTIVE_LOW, &desc->flags))
+			value = !value;
+
+		status = sprintf(buf, "%d\n", value);
+	}
 
 	mutex_unlock(&sysfs_lock);
 	return status;
@@ -294,6 +307,8 @@ static ssize_t gpio_value_store(struct device *dev,
 
 		status = strict_strtol(buf, 0, &value);
 		if (status == 0) {
+			if (test_bit(FLAG_ACTIVE_LOW, &desc->flags))
+				value = !value;
 			gpio_set_value_cansleep(gpio, value != 0);
 			status = size;
 		}
@@ -303,7 +318,7 @@ static ssize_t gpio_value_store(struct device *dev,
 	return status;
 }
 
-static /*const*/ DEVICE_ATTR(value, 0644,
+static const DEVICE_ATTR(value, 0644,
 		gpio_value_show, gpio_value_store);
 
 static irqreturn_t gpio_sysfs_irq(int irq, void *priv)
@@ -352,9 +367,11 @@ static int gpio_setup_irq(struct gpio_desc *desc, struct device *dev,
 
 	irq_flags = IRQF_SHARED;
 	if (test_bit(FLAG_TRIG_FALL, &gpio_flags))
-		irq_flags |= IRQF_TRIGGER_FALLING;
+		irq_flags |= test_bit(FLAG_ACTIVE_LOW, &desc->flags) ?
+			IRQF_TRIGGER_RISING : IRQF_TRIGGER_FALLING;
 	if (test_bit(FLAG_TRIG_RISE, &gpio_flags))
-		irq_flags |= IRQF_TRIGGER_RISING;
+		irq_flags |= test_bit(FLAG_ACTIVE_LOW, &desc->flags) ?
+			IRQF_TRIGGER_FALLING : IRQF_TRIGGER_RISING;
 
 	if (!pdesc) {
 		pdesc = kmalloc(sizeof(*pdesc), GFP_KERNEL);
@@ -475,9 +492,79 @@ found:
 
 static DEVICE_ATTR(edge, 0644, gpio_edge_show, gpio_edge_store);
 
+static int sysfs_set_active_low(struct gpio_desc *desc, struct device *dev,
+				int value)
+{
+	int			status = 0;
+
+	if (!!test_bit(FLAG_ACTIVE_LOW, &desc->flags) == !!value)
+		return 0;
+
+	if (value)
+		set_bit(FLAG_ACTIVE_LOW, &desc->flags);
+	else
+		clear_bit(FLAG_ACTIVE_LOW, &desc->flags);
+
+	/* reconfigure poll(2) support if enabled on one edge only */
+	if (dev != NULL && (!!test_bit(FLAG_TRIG_RISE, &desc->flags) ^
+				!!test_bit(FLAG_TRIG_FALL, &desc->flags))) {
+		unsigned long trigger_flags = desc->flags & GPIO_TRIGGER_MASK;
+
+		gpio_setup_irq(desc, dev, 0);
+		status = gpio_setup_irq(desc, dev, trigger_flags);
+	}
+
+	return status;
+}
+
+static ssize_t gpio_active_low_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	const struct gpio_desc	*desc = dev_get_drvdata(dev);
+	ssize_t			status;
+
+	mutex_lock(&sysfs_lock);
+
+	if (!test_bit(FLAG_EXPORT, &desc->flags))
+		status = -EIO;
+	else
+		status = sprintf(buf, "%d\n",
+				!!test_bit(FLAG_ACTIVE_LOW, &desc->flags));
+
+	mutex_unlock(&sysfs_lock);
+
+	return status;
+}
+
+static ssize_t gpio_active_low_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	struct gpio_desc	*desc = dev_get_drvdata(dev);
+	ssize_t			status;
+
+	mutex_lock(&sysfs_lock);
+
+	if (!test_bit(FLAG_EXPORT, &desc->flags)) {
+		status = -EIO;
+	} else {
+		long		value;
+
+		status = strict_strtol(buf, 0, &value);
+		if (status == 0)
+			status = sysfs_set_active_low(desc, dev, value != 0);
+	}
+
+	mutex_unlock(&sysfs_lock);
+
+	return status ? : size;
+}
+
+static const DEVICE_ATTR(active_low, 0644,
+		gpio_active_low_show, gpio_active_low_store);
+
 static const struct attribute *gpio_attrs[] = {
-	&dev_attr_direction.attr,
 	&dev_attr_value.attr,
+	&dev_attr_active_low.attr,
 	NULL,
 };
 
@@ -662,12 +749,12 @@ int gpio_export(unsigned gpio, bool direction_may_change)
 		dev = device_create(&gpio_class, desc->chip->dev, MKDEV(0, 0),
 				desc, ioname ? ioname : "gpio%d", gpio);
 		if (!IS_ERR(dev)) {
-			if (direction_may_change)
-				status = sysfs_create_group(&dev->kobj,
+			status = sysfs_create_group(&dev->kobj,
 						&gpio_attr_group);
-			else
+
+			if (!status && direction_may_change)
 				status = device_create_file(dev,
-						&dev_attr_value);
+						&dev_attr_direction);
 
 			if (!status && gpio_to_irq(gpio) >= 0
 					&& (direction_may_change
@@ -744,6 +831,55 @@ done:
 }
 EXPORT_SYMBOL_GPL(gpio_export_link);
 
+
+/**
+ * gpio_sysfs_set_active_low - set the polarity of gpio sysfs value
+ * @gpio: gpio to change
+ * @value: non-zero to use active low, i.e. inverted values
+ *
+ * Set the polarity of /sys/class/gpio/gpioN/value sysfs attribute.
+ * The GPIO does not have to be exported yet.  If poll(2) support has
+ * been enabled for either rising or falling edge, it will be
+ * reconfigured to follow the new polarity.
+ *
+ * Returns zero on success, else an error.
+ */
+int gpio_sysfs_set_active_low(unsigned gpio, int value)
+{
+	struct gpio_desc	*desc;
+	struct device		*dev = NULL;
+	int			status = -EINVAL;
+
+	if (!gpio_is_valid(gpio))
+		goto done;
+
+	mutex_lock(&sysfs_lock);
+
+	desc = &gpio_desc[gpio];
+
+	if (test_bit(FLAG_EXPORT, &desc->flags)) {
+		struct device *dev;
+
+		dev = class_find_device(&gpio_class, NULL, desc, match_export);
+		if (dev == NULL) {
+			status = -ENODEV;
+			goto unlock;
+		}
+	}
+
+	status = sysfs_set_active_low(desc, dev, value);
+
+unlock:
+	mutex_unlock(&sysfs_lock);
+
+done:
+	if (status)
+		pr_debug("%s: gpio%d status %d\n", __func__, gpio, status);
+
+	return status;
+}
+EXPORT_SYMBOL_GPL(gpio_sysfs_set_active_low);
+
 /**
  * gpio_unexport - reverse effect of gpio_export()
  * @gpio: gpio to make unavailable
@@ -1094,6 +1230,7 @@ void gpio_free(unsigned gpio)
 		}
 		desc_set_label(desc, NULL);
 		module_put(desc->chip->owner);
+		clear_bit(FLAG_ACTIVE_LOW, &desc->flags);
 		clear_bit(FLAG_REQUESTED, &desc->flags);
 	} else
 		WARN_ON(extra_checks);
diff --git a/include/asm-generic/gpio.h b/include/asm-generic/gpio.h
index 204bed37e82d..485eeb6c4ef3 100644
--- a/include/asm-generic/gpio.h
+++ b/include/asm-generic/gpio.h
@@ -145,6 +145,7 @@ extern int __gpio_to_irq(unsigned gpio);
 extern int gpio_export(unsigned gpio, bool direction_may_change);
 extern int gpio_export_link(struct device *dev, const char *name,
 			unsigned gpio);
+extern int gpio_sysfs_set_active_low(unsigned gpio, int value);
 extern void gpio_unexport(unsigned gpio);
 
 #endif	/* CONFIG_GPIO_SYSFS */
@@ -197,6 +198,11 @@ static inline int gpio_export_link(struct device *dev, const char *name,
 	return -ENOSYS;
 }
 
+static inline int gpio_sysfs_set_active_low(unsigned gpio, int value)
+{
+	return -ENOSYS;
+}
+
 static inline void gpio_unexport(unsigned gpio)
 {
 }
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 059bd189d35d..4e949a5b5b85 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -99,6 +99,12 @@ static inline int gpio_export_link(struct device *dev, const char *name,
 	return -EINVAL;
 }
 
+static inline int gpio_sysfs_set_active_low(unsigned gpio, int value)
+{
+	/* GPIO can never have been requested */
+	WARN_ON(1);
+	return -EINVAL;
+}
 
 static inline void gpio_unexport(unsigned gpio)
 {
-- 
cgit v1.2.3


From 9265576daeab1a884b11cc4c1087b72b488ca2e3 Mon Sep 17 00:00:00 2001
From: Vincent Sanders <vince@simtec.co.uk>
Date: Tue, 15 Dec 2009 16:46:35 -0800
Subject: sm501: implement acceleration features

This patch provides the acceleration entry points for the SM501
framebuffer driver.

This patch provides the sync, copyarea and fillrect entry points, using
the SM501's 2D acceleration engine to perform the operations in-chip
rather than across the bus.

Signed-off-by: Ben Dooks <ben@simtec.co.uk>
Signed-off-by: Simtec Linux Team <linux@simtec.co.uk>
Signed-off-by: Vincent Sanders <vince@simtec.co.uk>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/sm501fb.c    | 237 ++++++++++++++++++++++++++++++++++++++++++---
 include/linux/sm501-regs.h |   2 +
 2 files changed, 226 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/video/sm501fb.c b/drivers/video/sm501fb.c
index d1c8ea83b41f..35370d0ecf03 100644
--- a/drivers/video/sm501fb.c
+++ b/drivers/video/sm501fb.c
@@ -66,6 +66,7 @@ struct sm501fb_info {
 	struct fb_info		*fb[2];		/* fb info for both heads */
 	struct resource		*fbmem_res;	/* framebuffer resource */
 	struct resource		*regs_res;	/* registers resource */
+	struct resource		*regs2d_res;	/* 2d registers resource */
 	struct sm501_platdata_fb *pdata;	/* our platform data */
 
 	unsigned long		 pm_crt_ctrl;	/* pm: crt ctrl save */
@@ -73,6 +74,7 @@ struct sm501fb_info {
 	int			 irq;
 	int			 swap_endian;	/* set to swap rgb=>bgr */
 	void __iomem		*regs;		/* remapped registers */
+	void __iomem		*regs2d;	/* 2d remapped registers */
 	void __iomem		*fbmem;		/* remapped framebuffer */
 	size_t			 fbmem_len;	/* length of remapped region */
 };
@@ -123,9 +125,9 @@ static inline void sm501fb_sync_regs(struct sm501fb_info *info)
  * This is an attempt to lay out memory for the two framebuffers and
  * everything else
  *
- * |fbmem_res->start	                                       fbmem_res->end|
- * |                                                                         |
- * |fb[0].fix.smem_start    |         |fb[1].fix.smem_start    |     2K      |
+ * |fbmem_res->start					       fbmem_res->end|
+ * |									     |
+ * |fb[0].fix.smem_start    |	      |fb[1].fix.smem_start    |     2K	     |
  * |-> fb[0].fix.smem_len <-| spare   |-> fb[1].fix.smem_len <-|-> cursors <-|
  *
  * The "spare" space is for the 2d engine data
@@ -1246,7 +1248,173 @@ static ssize_t sm501fb_debug_show_pnl(struct device *dev,
 
 static DEVICE_ATTR(fbregs_pnl, 0444, sm501fb_debug_show_pnl, NULL);
 
-/* framebuffer ops */
+/* acceleration operations */
+static int sm501fb_sync(struct fb_info *info)
+{
+	int count = 1000000;
+	struct sm501fb_par  *par = info->par;
+	struct sm501fb_info *fbi = par->info;
+
+	/* wait for the 2d engine to be ready */
+	while ((count > 0) &&
+	       (readl(fbi->regs + SM501_SYSTEM_CONTROL) &
+		SM501_SYSCTRL_2D_ENGINE_STATUS) != 0)
+		count--;
+
+	if (count <= 0) {
+		dev_err(info->dev, "Timeout waiting for 2d engine sync\n");
+		return 1;
+	}
+	return 0;
+}
+
+static void sm501fb_copyarea(struct fb_info *info, const struct fb_copyarea *area)
+{
+	struct sm501fb_par  *par = info->par;
+	struct sm501fb_info *fbi = par->info;
+	int width = area->width;
+	int height = area->height;
+	int sx = area->sx;
+	int sy = area->sy;
+	int dx = area->dx;
+	int dy = area->dy;
+	unsigned long rtl = 0;
+
+	/* source clip */
+	if ((sx >= info->var.xres_virtual) ||
+	    (sy >= info->var.yres_virtual))
+		/* source Area not within virtual screen, skipping */
+		return;
+	if ((sx + width) >= info->var.xres_virtual)
+		width = info->var.xres_virtual - sx - 1;
+	if ((sy + height) >= info->var.yres_virtual)
+		height = info->var.yres_virtual - sy - 1;
+
+	/* dest clip */
+	if ((dx >= info->var.xres_virtual) ||
+	    (dy >= info->var.yres_virtual))
+		/* Destination Area not within virtual screen, skipping */
+		return;
+	if ((dx + width) >= info->var.xres_virtual)
+		width = info->var.xres_virtual - dx - 1;
+	if ((dy + height) >= info->var.yres_virtual)
+		height = info->var.yres_virtual - dy - 1;
+
+	if ((sx < dx) || (sy < dy)) {
+		rtl = 1 << 27;
+		sx += width - 1;
+		dx += width - 1;
+		sy += height - 1;
+		dy += height - 1;
+	}
+
+	if (sm501fb_sync(info))
+		return;
+
+	/* set the base addresses */
+	writel(par->screen.sm_addr, fbi->regs2d + SM501_2D_SOURCE_BASE);
+	writel(par->screen.sm_addr, fbi->regs2d + SM501_2D_DESTINATION_BASE);
+
+	/* set the window width */
+	writel((info->var.xres << 16) | info->var.xres,
+	       fbi->regs2d + SM501_2D_WINDOW_WIDTH);
+
+	/* set window stride */
+	writel((info->var.xres_virtual << 16) | info->var.xres_virtual,
+	       fbi->regs2d + SM501_2D_PITCH);
+
+	/* set data format */
+	switch (info->var.bits_per_pixel) {
+	case 8:
+		writel(0, fbi->regs2d + SM501_2D_STRETCH);
+		break;
+	case 16:
+		writel(0x00100000, fbi->regs2d + SM501_2D_STRETCH);
+		break;
+	case 32:
+		writel(0x00200000, fbi->regs2d + SM501_2D_STRETCH);
+		break;
+	}
+
+	/* 2d compare mask */
+	writel(0xffffffff, fbi->regs2d + SM501_2D_COLOR_COMPARE_MASK);
+
+	/* 2d mask */
+	writel(0xffffffff, fbi->regs2d + SM501_2D_MASK);
+
+	/* source and destination x y */
+	writel((sx << 16) | sy, fbi->regs2d + SM501_2D_SOURCE);
+	writel((dx << 16) | dy, fbi->regs2d + SM501_2D_DESTINATION);
+
+	/* w/h */
+	writel((width << 16) | height, fbi->regs2d + SM501_2D_DIMENSION);
+
+	/* do area move */
+	writel(0x800000cc | rtl, fbi->regs2d + SM501_2D_CONTROL);
+}
+
+static void sm501fb_fillrect(struct fb_info *info, const struct fb_fillrect *rect)
+{
+	struct sm501fb_par  *par = info->par;
+	struct sm501fb_info *fbi = par->info;
+	int width = rect->width, height = rect->height;
+
+	if ((rect->dx >= info->var.xres_virtual) ||
+	    (rect->dy >= info->var.yres_virtual))
+		/* Rectangle not within virtual screen, skipping */
+		return;
+	if ((rect->dx + width) >= info->var.xres_virtual)
+		width = info->var.xres_virtual - rect->dx - 1;
+	if ((rect->dy + height) >= info->var.yres_virtual)
+		height = info->var.yres_virtual - rect->dy - 1;
+
+	if (sm501fb_sync(info))
+		return;
+
+	/* set the base addresses */
+	writel(par->screen.sm_addr, fbi->regs2d + SM501_2D_SOURCE_BASE);
+	writel(par->screen.sm_addr, fbi->regs2d + SM501_2D_DESTINATION_BASE);
+
+	/* set the window width */
+	writel((info->var.xres << 16) | info->var.xres,
+	       fbi->regs2d + SM501_2D_WINDOW_WIDTH);
+
+	/* set window stride */
+	writel((info->var.xres_virtual << 16) | info->var.xres_virtual,
+	       fbi->regs2d + SM501_2D_PITCH);
+
+	/* set data format */
+	switch (info->var.bits_per_pixel) {
+	case 8:
+		writel(0, fbi->regs2d + SM501_2D_STRETCH);
+		break;
+	case 16:
+		writel(0x00100000, fbi->regs2d + SM501_2D_STRETCH);
+		break;
+	case 32:
+		writel(0x00200000, fbi->regs2d + SM501_2D_STRETCH);
+		break;
+	}
+
+	/* 2d compare mask */
+	writel(0xffffffff, fbi->regs2d + SM501_2D_COLOR_COMPARE_MASK);
+
+	/* 2d mask */
+	writel(0xffffffff, fbi->regs2d + SM501_2D_MASK);
+
+	/* colour */
+	writel(rect->color, fbi->regs2d + SM501_2D_FOREGROUND);
+
+	/* x y */
+	writel((rect->dx << 16) | rect->dy, fbi->regs2d + SM501_2D_DESTINATION);
+
+	/* w/h */
+	writel((width << 16) | height, fbi->regs2d + SM501_2D_DIMENSION);
+
+	/* do rectangle fill */
+	writel(0x800100cc, fbi->regs2d + SM501_2D_CONTROL);
+}
+
 
 static struct fb_ops sm501fb_ops_crt = {
 	.owner		= THIS_MODULE,
@@ -1256,9 +1424,10 @@ static struct fb_ops sm501fb_ops_crt = {
 	.fb_setcolreg	= sm501fb_setcolreg,
 	.fb_pan_display	= sm501fb_pan_crt,
 	.fb_cursor	= sm501fb_cursor,
-	.fb_fillrect	= cfb_fillrect,
-	.fb_copyarea	= cfb_copyarea,
+	.fb_fillrect	= sm501fb_fillrect,
+	.fb_copyarea	= sm501fb_copyarea,
 	.fb_imageblit	= cfb_imageblit,
+	.fb_sync	= sm501fb_sync,
 };
 
 static struct fb_ops sm501fb_ops_pnl = {
@@ -1269,9 +1438,10 @@ static struct fb_ops sm501fb_ops_pnl = {
 	.fb_blank	= sm501fb_blank_pnl,
 	.fb_setcolreg	= sm501fb_setcolreg,
 	.fb_cursor	= sm501fb_cursor,
-	.fb_fillrect	= cfb_fillrect,
-	.fb_copyarea	= cfb_copyarea,
+	.fb_fillrect	= sm501fb_fillrect,
+	.fb_copyarea	= sm501fb_copyarea,
 	.fb_imageblit	= cfb_imageblit,
+	.fb_sync	= sm501fb_sync,
 };
 
 /* sm501_init_cursor
@@ -1329,7 +1499,8 @@ static int sm501fb_start(struct sm501fb_info *info,
 		dev_warn(dev, "no irq for device\n");
 	}
 
-	/* allocate, reserve and remap resources for registers */
+	/* allocate, reserve and remap resources for display
+	 * controller registers */
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (res == NULL) {
 		dev_err(dev, "no resource definition for registers\n");
@@ -1354,12 +1525,38 @@ static int sm501fb_start(struct sm501fb_info *info,
 		goto err_regs_res;
 	}
 
+	/* allocate, reserve and remap resources for 2d
+	 * controller registers */
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	if (res == NULL) {
+		dev_err(dev, "no resource definition for 2d registers\n");
+		ret = -ENOENT;
+		goto err_regs_map;
+	}
+
+	info->regs2d_res = request_mem_region(res->start,
+					      resource_size(res),
+					      pdev->name);
+
+	if (info->regs2d_res == NULL) {
+		dev_err(dev, "cannot claim registers\n");
+		ret = -ENXIO;
+		goto err_regs_map;
+	}
+
+	info->regs2d = ioremap(res->start, resource_size(res));
+	if (info->regs2d == NULL) {
+		dev_err(dev, "cannot remap registers\n");
+		ret = -ENXIO;
+		goto err_regs2d_res;
+	}
+
 	/* allocate, reserve resources for framebuffer */
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 2);
 	if (res == NULL) {
 		dev_err(dev, "no memory resource defined\n");
 		ret = -ENXIO;
-		goto err_regs_map;
+		goto err_regs2d_map;
 	}
 
 	info->fbmem_res = request_mem_region(res->start,
@@ -1368,7 +1565,7 @@ static int sm501fb_start(struct sm501fb_info *info,
 	if (info->fbmem_res == NULL) {
 		dev_err(dev, "cannot claim framebuffer\n");
 		ret = -ENXIO;
-		goto err_regs_map;
+		goto err_regs2d_map;
 	}
 
 	info->fbmem = ioremap(res->start, resource_size(res));
@@ -1389,8 +1586,10 @@ static int sm501fb_start(struct sm501fb_info *info,
 	/* enable display controller */
 	sm501_unit_power(dev->parent, SM501_GATE_DISPLAY, 1);
 
-	/* setup cursors */
+	/* enable 2d controller */
+	sm501_unit_power(dev->parent, SM501_GATE_2D_ENGINE, 1);
 
+	/* setup cursors */
 	sm501_init_cursor(info->fb[HEAD_CRT], SM501_DC_CRT_HWC_ADDR);
 	sm501_init_cursor(info->fb[HEAD_PANEL], SM501_DC_PANEL_HWC_ADDR);
 
@@ -1400,6 +1599,13 @@ static int sm501fb_start(struct sm501fb_info *info,
 	release_resource(info->fbmem_res);
 	kfree(info->fbmem_res);
 
+ err_regs2d_map:
+	iounmap(info->regs2d);
+
+ err_regs2d_res:
+	release_resource(info->regs2d_res);
+	kfree(info->regs2d_res);
+
  err_regs_map:
 	iounmap(info->regs);
 
@@ -1420,6 +1626,10 @@ static void sm501fb_stop(struct sm501fb_info *info)
 	release_resource(info->fbmem_res);
 	kfree(info->fbmem_res);
 
+	iounmap(info->regs2d);
+	release_resource(info->regs2d_res);
+	kfree(info->regs2d_res);
+
 	iounmap(info->regs);
 	release_resource(info->regs_res);
 	kfree(info->regs_res);
@@ -1486,7 +1696,8 @@ static int sm501fb_init_fb(struct fb_info *fb,
 		par->ops.fb_cursor = NULL;
 
 	fb->fbops = &par->ops;
-	fb->flags = FBINFO_FLAG_DEFAULT |
+	fb->flags = FBINFO_FLAG_DEFAULT | FBINFO_READS_FAST |
+		FBINFO_HWACCEL_COPYAREA | FBINFO_HWACCEL_FILLRECT |
 		FBINFO_HWACCEL_XPAN | FBINFO_HWACCEL_YPAN;
 
 	/* fixed data */
diff --git a/include/linux/sm501-regs.h b/include/linux/sm501-regs.h
index d53642d2d899..67ed2c542831 100644
--- a/include/linux/sm501-regs.h
+++ b/include/linux/sm501-regs.h
@@ -31,6 +31,8 @@
 #define SM501_SYSCTRL_PCI_SUBSYS_LOCK	(1<<11)
 #define SM501_SYSCTRL_PCI_BURST_READ_EN	(1<<15)
 
+#define SM501_SYSCTRL_2D_ENGINE_STATUS	(1<<19)
+
 /* miscellaneous control */
 
 #define SM501_MISC_CONTROL		(0x000004)
-- 
cgit v1.2.3


From 904e812931f001b984912b2d2f653ea69520313c Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 15 Dec 2009 16:46:52 -0800
Subject: reiserfs: remove /proc/fs/reiserfs/version

/proc/fs/reiserfs/version is on the way of removing ->read_proc interface.
 It's empty however, so simply remove it instead of doing dummy
conversion.  It's hard to see what information userspace can extract from
empty file.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/reiserfs/procfs.c        | 39 ---------------------------------------
 fs/reiserfs/super.c         |  4 ----
 include/linux/reiserfs_fs.h |  5 -----
 3 files changed, 48 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 9229e5514a4e..0ebb11646526 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -48,14 +48,6 @@ static int show_version(struct seq_file *m, struct super_block *sb)
 	return 0;
 }
 
-int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset,
-				    int count, int *eof, void *data)
-{
-	*start = buffer;
-	*eof = 1;
-	return 0;
-}
-
 #define SF( x ) ( r -> x )
 #define SFP( x ) SF( s_proc_info_data.x )
 #define SFPL( x ) SFP( x[ level ] )
@@ -538,19 +530,6 @@ int reiserfs_proc_info_done(struct super_block *sb)
 	return 0;
 }
 
-struct proc_dir_entry *reiserfs_proc_register_global(char *name,
-						     read_proc_t * func)
-{
-	return (proc_info_root) ? create_proc_read_entry(name, 0,
-							 proc_info_root,
-							 func, NULL) : NULL;
-}
-
-void reiserfs_proc_unregister_global(const char *name)
-{
-	remove_proc_entry(name, proc_info_root);
-}
-
 int reiserfs_proc_info_global_init(void)
 {
 	if (proc_info_root == NULL) {
@@ -585,16 +564,6 @@ int reiserfs_proc_info_done(struct super_block *sb)
 	return 0;
 }
 
-struct proc_dir_entry *reiserfs_proc_register_global(char *name,
-						     read_proc_t * func)
-{
-	return NULL;
-}
-
-void reiserfs_proc_unregister_global(const char *name)
-{;
-}
-
 int reiserfs_proc_info_global_init(void)
 {
 	return 0;
@@ -603,14 +572,6 @@ int reiserfs_proc_info_global_done(void)
 {
 	return 0;
 }
-
-int reiserfs_global_version_in_proc(char *buffer, char **start,
-				    off_t offset,
-				    int count, int *eof, void *data)
-{
-	return 0;
-}
-
 /* REISERFS_PROC_INFO */
 #endif
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 339b0baf2af6..b4a7dd03bdb9 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2222,8 +2222,6 @@ static int __init init_reiserfs_fs(void)
 	}
 
 	reiserfs_proc_info_global_init();
-	reiserfs_proc_register_global("version",
-				      reiserfs_global_version_in_proc);
 
 	ret = register_filesystem(&reiserfs_fs_type);
 
@@ -2231,7 +2229,6 @@ static int __init init_reiserfs_fs(void)
 		return 0;
 	}
 
-	reiserfs_proc_unregister_global("version");
 	reiserfs_proc_info_global_done();
 	destroy_inodecache();
 
@@ -2240,7 +2237,6 @@ static int __init init_reiserfs_fs(void)
 
 static void __exit exit_reiserfs_fs(void)
 {
-	reiserfs_proc_unregister_global("version");
 	reiserfs_proc_info_global_done();
 	unregister_filesystem(&reiserfs_fs_type);
 	destroy_inodecache();
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index a05b4a20768d..f65ed0d3a920 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -2061,13 +2061,8 @@ struct dentry *reiserfs_get_parent(struct dentry *);
 
 int reiserfs_proc_info_init(struct super_block *sb);
 int reiserfs_proc_info_done(struct super_block *sb);
-struct proc_dir_entry *reiserfs_proc_register_global(char *name,
-						     read_proc_t * func);
-void reiserfs_proc_unregister_global(const char *name);
 int reiserfs_proc_info_global_init(void);
 int reiserfs_proc_info_global_done(void);
-int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset,
-				    int count, int *eof, void *data);
 
 #if defined( REISERFS_PROC_INFO )
 
-- 
cgit v1.2.3


From e3c96f53ac132743fda1384910feb863a2eab916 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 15 Dec 2009 16:46:54 -0800
Subject: reiserfs: don't compile procfs.o at all if no support

* small define cleanup in header
* fix #ifdeffery in procfs.c via Kconfig

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/reiserfs/Makefile        |  6 +++++-
 fs/reiserfs/procfs.c        | 26 --------------------------
 include/linux/reiserfs_fs.h | 30 +++++++++++++++++++++---------
 3 files changed, 26 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 6a9e30c041dd..792b3cb2cd18 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -7,7 +7,11 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
 reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
 		 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
 		 hashes.o tail_conversion.o journal.o resize.o \
-		 item_ops.o ioctl.o procfs.o xattr.o lock.o
+		 item_ops.o ioctl.o xattr.o lock.o
+
+ifeq ($(CONFIG_REISERFS_PROC_INFO),y)
+reiserfs-objs += procfs.o
+endif
 
 ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
 reiserfs-objs += xattr_user.o xattr_trusted.o
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 0ebb11646526..7a9981196c1c 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -17,8 +17,6 @@
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 
-#ifdef CONFIG_REISERFS_PROC_INFO
-
 /*
  * LOCKING:
  *
@@ -551,30 +549,6 @@ int reiserfs_proc_info_global_done(void)
 	}
 	return 0;
 }
-
-/* REISERFS_PROC_INFO */
-#else
-
-int reiserfs_proc_info_init(struct super_block *sb)
-{
-	return 0;
-}
-int reiserfs_proc_info_done(struct super_block *sb)
-{
-	return 0;
-}
-
-int reiserfs_proc_info_global_init(void)
-{
-	return 0;
-}
-int reiserfs_proc_info_global_done(void)
-{
-	return 0;
-}
-/* REISERFS_PROC_INFO */
-#endif
-
 /*
  * Revision 1.1.8.2  2001/07/15 17:08:42  god
  *  . use get_super() in procfs.c
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index f65ed0d3a920..c96c1858fe2c 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -2051,21 +2051,13 @@ void set_de_name_and_namelen(struct reiserfs_dir_entry *de);
 int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
 			struct treepath *path, struct reiserfs_dir_entry *de);
 struct dentry *reiserfs_get_parent(struct dentry *);
-/* procfs.c */
-
-#if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO )
-#define REISERFS_PROC_INFO
-#else
-#undef REISERFS_PROC_INFO
-#endif
 
+#ifdef CONFIG_REISERFS_PROC_INFO
 int reiserfs_proc_info_init(struct super_block *sb);
 int reiserfs_proc_info_done(struct super_block *sb);
 int reiserfs_proc_info_global_init(void);
 int reiserfs_proc_info_global_done(void);
 
-#if defined( REISERFS_PROC_INFO )
-
 #define PROC_EXP( e )   e
 
 #define __PINFO( sb ) REISERFS_SB(sb) -> s_proc_info_data
@@ -2079,6 +2071,26 @@ int reiserfs_proc_info_global_done(void);
     PROC_INFO_ADD( sb, free_at[ ( level ) ], B_FREE_SPACE( bh ) );	\
     PROC_INFO_ADD( sb, items_at[ ( level ) ], B_NR_ITEMS( bh ) )
 #else
+static inline int reiserfs_proc_info_init(struct super_block *sb)
+{
+	return 0;
+}
+
+static inline int reiserfs_proc_info_done(struct super_block *sb)
+{
+	return 0;
+}
+
+static inline int reiserfs_proc_info_global_init(void)
+{
+	return 0;
+}
+
+static inline int reiserfs_proc_info_global_done(void)
+{
+	return 0;
+}
+
 #define PROC_EXP( e )
 #define VOID_V ( ( void ) 0 )
 #define PROC_INFO_MAX( sb, field, value ) VOID_V
-- 
cgit v1.2.3


From 569b846df54ffb2827b83ce3244c5f032394cba4 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 15 Dec 2009 16:47:03 -0800
Subject: memcg: coalesce uncharge during unmap/truncate

In massive parallel enviroment, res_counter can be a performance
bottleneck.  One strong techinque to reduce lock contention is reducing
calls by coalescing some amount of calls into one.

Considering charge/uncharge chatacteristic,
	- charge is done one by one via demand-paging.
	- uncharge is done by
		- in chunk at munmap, truncate, exit, execve...
		- one by one via vmscan/paging.

It seems we have a chance to coalesce uncharges for improving scalability
at unmap/truncation.

This patch is a for coalescing uncharge.  For avoiding scattering memcg's
structure to functions under /mm, this patch adds memcg batch uncharge
information to the task.  A reason for per-task batching is for making use
of caller's context information.  We do batched uncharge (deleyed
uncharge) when truncation/unmap occurs but do direct uncharge when
uncharge is called by memory reclaim (vmscan.c).

The degree of coalescing depends on callers
  - at invalidate/trucate... pagevec size
  - at unmap ....ZAP_BLOCK_SIZE
(memory itself will be freed in this degree.)
Then, we'll not coalescing too much.

On x86-64 8cpu server, I tested overheads of memcg at page fault by
running a program which does map/fault/unmap in a loop. Running
a task per a cpu by taskset and see sum of the number of page faults
in 60secs.

[without memcg config]
  40156968  page-faults              #      0.085 M/sec   ( +-   0.046% )
  27.67 cache-miss/faults
[root cgroup]
  36659599  page-faults              #      0.077 M/sec   ( +-   0.247% )
  31.58 miss/faults
[in a child cgroup]
  18444157  page-faults              #      0.039 M/sec   ( +-   0.133% )
  69.96 miss/faults
[child with this patch]
  27133719  page-faults              #      0.057 M/sec   ( +-   0.155% )
  47.16 miss/faults

We can see some amounts of improvement.
(root cgroup doesn't affected by this patch)
Another patch for "charge" will follow this and above will be improved more.

Changelog(since 2009/10/02):
 - renamed filed of memcg_batch (as pages to bytes, memsw to memsw_bytes)
 - some clean up and commentary/description updates.
 - added initialize code to copy_process(). (possible bug fix)

Changelog(old):
 - fixed !CONFIG_MEM_CGROUP case.
 - rebased onto the latest mmotm + softlimit fix patches.
 - unified patch for callers
 - added commetns.
 - make ->do_batch as bool.
 - removed css_get() at el. We don't need it.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 13 +++++++
 include/linux/sched.h      |  8 ++++
 kernel/fork.c              |  4 ++
 mm/memcontrol.c            | 96 +++++++++++++++++++++++++++++++++++++++++++---
 mm/memory.c                |  2 +
 mm/truncate.c              |  6 +++
 6 files changed, 123 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bf9213b2db8f..91300c972e76 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -54,6 +54,11 @@ extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
 extern void mem_cgroup_del_lru(struct page *page);
 extern void mem_cgroup_move_lists(struct page *page,
 				  enum lru_list from, enum lru_list to);
+
+/* For coalescing uncharge for reducing memcg' overhead*/
+extern void mem_cgroup_uncharge_start(void);
+extern void mem_cgroup_uncharge_end(void);
+
 extern void mem_cgroup_uncharge_page(struct page *page);
 extern void mem_cgroup_uncharge_cache_page(struct page *page);
 extern int mem_cgroup_shmem_charge_fallback(struct page *page,
@@ -151,6 +156,14 @@ static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr)
 {
 }
 
+static inline void mem_cgroup_uncharge_start(void)
+{
+}
+
+static inline void mem_cgroup_uncharge_end(void)
+{
+}
+
 static inline void mem_cgroup_uncharge_page(struct page *page)
 {
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c858f38e81a..f4c145410a8d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1544,6 +1544,14 @@ struct task_struct {
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
 	unsigned long stack_start;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
+	struct memcg_batch_info {
+		int do_batch;	/* incremented when batch uncharge started */
+		struct mem_cgroup *memcg; /* target memcg of uncharge */
+		unsigned long bytes; 		/* uncharged usage */
+		unsigned long memsw_bytes; /* uncharged mem+swap usage */
+	} memcg_batch;
+#endif
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd91447e052..b6cbd33dde80 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1127,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	p->memcg_batch.do_batch = 0;
+	p->memcg_batch.memcg = NULL;
+#endif
 
 	p->bts = NULL;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b5b108c1c6b..a730c91b8e69 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1827,6 +1827,50 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 	css_put(&mem->css);
 }
 
+static void
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+{
+	struct memcg_batch_info *batch = NULL;
+	bool uncharge_memsw = true;
+	/* If swapout, usage of swap doesn't decrease */
+	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+		uncharge_memsw = false;
+	/*
+	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+	 * In those cases, all pages freed continously can be expected to be in
+	 * the same cgroup and we have chance to coalesce uncharges.
+	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+	 * because we want to do uncharge as soon as possible.
+	 */
+	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
+		goto direct_uncharge;
+
+	batch = &current->memcg_batch;
+	/*
+	 * In usual, we do css_get() when we remember memcg pointer.
+	 * But in this case, we keep res->usage until end of a series of
+	 * uncharges. Then, it's ok to ignore memcg's refcnt.
+	 */
+	if (!batch->memcg)
+		batch->memcg = mem;
+	/*
+	 * In typical case, batch->memcg == mem. This means we can
+	 * merge a series of uncharges to an uncharge of res_counter.
+	 * If not, we uncharge res_counter ony by one.
+	 */
+	if (batch->memcg != mem)
+		goto direct_uncharge;
+	/* remember freed charge and uncharge it later */
+	batch->bytes += PAGE_SIZE;
+	if (uncharge_memsw)
+		batch->memsw_bytes += PAGE_SIZE;
+	return;
+direct_uncharge:
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (uncharge_memsw)
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	return;
+}
 
 /*
  * uncharge if !page_mapped(page)
@@ -1875,12 +1919,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		if (do_swap_account &&
-				(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
@@ -1926,6 +1966,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
+/*
+ * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
+ * In that cases, pages are freed continuously and we can expect pages
+ * are in the same memcg. All these calls itself limits the number of
+ * pages freed at once, then uncharge_start/end() is called properly.
+ * This may be called prural(2) times in a context,
+ */
+
+void mem_cgroup_uncharge_start(void)
+{
+	current->memcg_batch.do_batch++;
+	/* We can do nest. */
+	if (current->memcg_batch.do_batch == 1) {
+		current->memcg_batch.memcg = NULL;
+		current->memcg_batch.bytes = 0;
+		current->memcg_batch.memsw_bytes = 0;
+	}
+}
+
+void mem_cgroup_uncharge_end(void)
+{
+	struct memcg_batch_info *batch = &current->memcg_batch;
+
+	if (!batch->do_batch)
+		return;
+
+	batch->do_batch--;
+	if (batch->do_batch) /* If stacked, do nothing. */
+		return;
+
+	if (!batch->memcg)
+		return;
+	/*
+	 * This "batch->memcg" is valid without any css_get/put etc...
+	 * bacause we hide charges behind us.
+	 */
+	if (batch->bytes)
+		res_counter_uncharge(&batch->memcg->res, batch->bytes);
+	if (batch->memsw_bytes)
+		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+	/* forget this pointer (for sanity check) */
+	batch->memcg = NULL;
+}
+
 #ifdef CONFIG_SWAP
 /*
  * called after __delete_from_swap_cache() and drop "page" account.
diff --git a/mm/memory.c b/mm/memory.c
index a54b2c498444..aed45eaf8ac9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -956,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 		details = NULL;
 
 	BUG_ON(addr >= end);
+	mem_cgroup_uncharge_start();
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
@@ -968,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 						zap_work, details);
 	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
 	tlb_end_vma(tlb, vma);
+	mem_cgroup_uncharge_end();
 
 	return addr;
 }
diff --git a/mm/truncate.c b/mm/truncate.c
index 2c147a7e5f2c..342deee22684 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			pagevec_release(&pvec);
 			break;
 		}
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 	}
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 	pagevec_init(&pvec, 0);
 	while (next <= end &&
 			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t index;
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 				break;
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
 	return ret;
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	while (next <= end && !wrapped &&
 		pagevec_lookup(&pvec, mapping, next,
 			min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 			pgoff_t page_index;
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 			unlock_page(page);
 		}
 		pagevec_release(&pvec);
+		mem_cgroup_uncharge_end();
 		cond_resched();
 	}
 	return ret;
-- 
cgit v1.2.3


From d8046582d5ee24448800e71c6933fdb6813aa062 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 15 Dec 2009 16:47:09 -0800
Subject: memcg: make memcg's file mapped consistent with global VM

In global VM, FILE_MAPPED is used but memcg uses MAPPED_FILE.  This makes
grep difficult.  Replace memcg's MAPPED_FILE with FILE_MAPPED

And in global VM, mapped shared memory is accounted into FILE_MAPPED.
But memcg doesn't. fix it.
Note:
  page_is_file_cache() just checks SwapBacked or not.
  So, we need to check PageAnon.

Cc: Balbir Singh <balbir@in.ibm.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  4 ++--
 mm/memcontrol.c            | 21 +++++++++------------
 mm/rmap.c                  |  4 ++--
 3 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 91300c972e76..0b46c2068b96 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -122,7 +122,7 @@ static inline bool mem_cgroup_disabled(void)
 }
 
 extern bool mem_cgroup_oom_called(struct task_struct *task);
-void mem_cgroup_update_mapped_file_stat(struct page *page, int val);
+void mem_cgroup_update_file_mapped(struct page *page, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask, int nid,
 						int zid);
@@ -287,7 +287,7 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
-static inline void mem_cgroup_update_mapped_file_stat(struct page *page,
+static inline void mem_cgroup_update_file_mapped(struct page *page,
 							int val)
 {
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6587f657d57c..0b3efb843a87 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -67,7 +67,7 @@ enum mem_cgroup_stat_index {
 	 */
 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
+	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
@@ -1227,7 +1227,7 @@ static void record_last_oom(struct mem_cgroup *mem)
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
  */
-void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
+void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
 	struct mem_cgroup_stat *stat;
@@ -1235,9 +1235,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
 	int cpu;
 	struct page_cgroup *pc;
 
-	if (!page_is_file_cache(page))
-		return;
-
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
@@ -1257,7 +1254,7 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
 	stat = &mem->stat;
 	cpustat = &stat->cpustat[cpu];
 
-	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
 done:
 	unlock_page_cgroup(pc);
 }
@@ -1654,18 +1651,18 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
-	if (page_is_file_cache(page) && page_mapped(page)) {
+	if (page_mapped(page) && !PageAnon(page)) {
 		cpu = smp_processor_id();
 		/* Update mapped_file data for mem_cgroup "from" */
 		stat = &from->stat;
 		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
 						-1);
 
 		/* Update mapped_file data for mem_cgroup "to" */
 		stat = &to->stat;
 		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
 						1);
 	}
 
@@ -2889,7 +2886,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 enum {
 	MCS_CACHE,
 	MCS_RSS,
-	MCS_MAPPED_FILE,
+	MCS_FILE_MAPPED,
 	MCS_PGPGIN,
 	MCS_PGPGOUT,
 	MCS_SWAP,
@@ -2933,8 +2930,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 	s->stat[MCS_RSS] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
-	s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
+	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
+	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
 	s->stat[MCS_PGPGIN] += val;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
diff --git a/mm/rmap.c b/mm/rmap.c
index 98135dbd25ba..278cd277bdec 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -721,7 +721,7 @@ void page_add_file_rmap(struct page *page)
 {
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_update_mapped_file_stat(page, 1);
+		mem_cgroup_update_file_mapped(page, 1);
 	}
 }
 
@@ -753,8 +753,8 @@ void page_remove_rmap(struct page *page)
 		__dec_zone_page_state(page, NR_ANON_PAGES);
 	} else {
 		__dec_zone_page_state(page, NR_FILE_MAPPED);
+		mem_cgroup_update_file_mapped(page, -1);
 	}
-	mem_cgroup_update_mapped_file_stat(page, -1);
 	/*
 	 * It would be tidy to reset the PageAnon mapping here,
 	 * but that might overwrite a racing page_add_anon_rmap
-- 
cgit v1.2.3


From 57f9fd7d25ac9a0d7e3a4ced580e780ab4524e3b Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Date: Tue, 15 Dec 2009 16:47:11 -0800
Subject: memcg: cleanup mem_cgroup_move_parent()

mem_cgroup_move_parent() calls try_charge first and cancel_charge on
failure.  IMHO, charge/uncharge(especially charge) is high cost operation,
so we should avoid it as far as possible.

This patch tries to delay try_charge in mem_cgroup_move_parent() by
re-ordering checks it does.

And this patch renames mem_cgroup_move_account() to
__mem_cgroup_move_account(), changes the return value of
__mem_cgroup_move_account() from int to void, and adds a new
wrapper(mem_cgroup_move_account()), which checks whether a @pc is valid
for moving account and calls __mem_cgroup_move_account().

This patch removes the last caller of trylock_page_cgroup(), so removes
its definition too.

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page_cgroup.h |  7 ++--
 mm/memcontrol.c             | 84 +++++++++++++++++++--------------------------
 2 files changed, 37 insertions(+), 54 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 4b938d4f3ac2..b0e4eb126236 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -57,6 +57,8 @@ static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
 static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)	\
 	{ return test_and_clear_bit(PCG_##lname, &pc->flags);  }
 
+TESTPCGFLAG(Locked, LOCK)
+
 /* Cache flag is set only once (at allocation) */
 TESTPCGFLAG(Cache, CACHE)
 CLEARPCGFLAG(Cache, CACHE)
@@ -86,11 +88,6 @@ static inline void lock_page_cgroup(struct page_cgroup *pc)
 	bit_spin_lock(PCG_LOCK, &pc->flags);
 }
 
-static inline int trylock_page_cgroup(struct page_cgroup *pc)
-{
-	return bit_spin_trylock(PCG_LOCK, &pc->flags);
-}
-
 static inline void unlock_page_cgroup(struct page_cgroup *pc)
 {
 	bit_spin_unlock(PCG_LOCK, &pc->flags);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d6b4a912a6d..6273984f2e34 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1613,27 +1613,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 }
 
 /**
- * mem_cgroup_move_account - move account of the page
+ * __mem_cgroup_move_account - move account of the page
  * @pc:	page_cgroup of the page.
  * @from: mem_cgroup which the page is moved from.
  * @to:	mem_cgroup which the page is moved to. @from != @to.
  *
  * The caller must confirm following.
  * - page is not on LRU (isolate_page() is useful.)
- *
- * returns 0 at success,
- * returns -EBUSY when lock is busy or "pc" is unstable.
+ * - the pc is locked, used, and ->mem_cgroup points to @from.
  *
  * This function does "uncharge" from old cgroup but doesn't do "charge" to
  * new cgroup. It should be done by a caller.
  */
 
-static int mem_cgroup_move_account(struct page_cgroup *pc,
+static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	struct mem_cgroup *from, struct mem_cgroup *to)
 {
-	struct mem_cgroup_per_zone *from_mz, *to_mz;
-	int nid, zid;
-	int ret = -EBUSY;
 	struct page *page;
 	int cpu;
 	struct mem_cgroup_stat *stat;
@@ -1641,20 +1636,9 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
-
-	nid = page_cgroup_nid(pc);
-	zid = page_cgroup_zid(pc);
-	from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
-	to_mz =  mem_cgroup_zoneinfo(to, nid, zid);
-
-	if (!trylock_page_cgroup(pc))
-		return ret;
-
-	if (!PageCgroupUsed(pc))
-		goto out;
-
-	if (pc->mem_cgroup != from)
-		goto out;
+	VM_BUG_ON(!PageCgroupLocked(pc));
+	VM_BUG_ON(!PageCgroupUsed(pc));
+	VM_BUG_ON(pc->mem_cgroup != from);
 
 	if (!mem_cgroup_is_root(from))
 		res_counter_uncharge(&from->res, PAGE_SIZE);
@@ -1683,15 +1667,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	css_get(&to->css);
 	pc->mem_cgroup = to;
 	mem_cgroup_charge_statistics(to, pc, true);
-	ret = 0;
-out:
-	unlock_page_cgroup(pc);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
 	 * this function is just force_empty() and it's garanteed that
 	 * "to" is never removed. So, we don't check rmdir status here.
 	 */
+}
+
+/*
+ * check whether the @pc is valid for moving account and call
+ * __mem_cgroup_move_account()
+ */
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+				struct mem_cgroup *from, struct mem_cgroup *to)
+{
+	int ret = -EINVAL;
+	lock_page_cgroup(pc);
+	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+		__mem_cgroup_move_account(pc, from, to);
+		ret = 0;
+	}
+	unlock_page_cgroup(pc);
 	return ret;
 }
 
@@ -1713,38 +1710,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	if (!pcg)
 		return -EINVAL;
 
+	ret = -EBUSY;
+	if (!get_page_unless_zero(page))
+		goto out;
+	if (isolate_lru_page(page))
+		goto put;
 
 	parent = mem_cgroup_from_cont(pcg);
-
-
 	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
 	if (ret || !parent)
-		return ret;
-
-	if (!get_page_unless_zero(page)) {
-		ret = -EBUSY;
-		goto uncharge;
-	}
-
-	ret = isolate_lru_page(page);
-
-	if (ret)
-		goto cancel;
+		goto put_back;
 
 	ret = mem_cgroup_move_account(pc, child, parent);
-
+	if (!ret)
+		css_put(&parent->css);	/* drop extra refcnt by try_charge() */
+	else
+		mem_cgroup_cancel_charge(parent);	/* does css_put */
+put_back:
 	putback_lru_page(page);
-	if (!ret) {
-		put_page(page);
-		/* drop extra refcnt by try_charge() */
-		css_put(&parent->css);
-		return 0;
-	}
-
-cancel:
+put:
 	put_page(page);
-uncharge:
-	mem_cgroup_cancel_charge(parent);
+out:
 	return ret;
 }
 
-- 
cgit v1.2.3


From c6a47cc2ccf9649ee09eeddd70a6d061bde69568 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 15 Dec 2009 16:47:15 -0800
Subject: ptrace: cleanup ptrace_init_task()->ptrace_link() path

No functional changes.

ptrace_init_task() looks confusing, as if we always auto-attach when "bool
ptrace" argument is true, while in fact we attach only if current is
traced.

Make the code more explicit and kill now unused ptrace_link().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 7456d7d87a19..1951805df63a 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -105,12 +105,7 @@ static inline int ptrace_reparented(struct task_struct *child)
 {
 	return child->real_parent != child->parent;
 }
-static inline void ptrace_link(struct task_struct *child,
-			       struct task_struct *new_parent)
-{
-	if (unlikely(child->ptrace))
-		__ptrace_link(child, new_parent);
-}
+
 static inline void ptrace_unlink(struct task_struct *child)
 {
 	if (unlikely(child->ptrace))
@@ -169,9 +164,9 @@ static inline void ptrace_init_task(struct task_struct *child, bool ptrace)
 	INIT_LIST_HEAD(&child->ptraced);
 	child->parent = child->real_parent;
 	child->ptrace = 0;
-	if (unlikely(ptrace)) {
+	if (unlikely(ptrace) && (current->ptrace & PT_PTRACED)) {
 		child->ptrace = current->ptrace;
-		ptrace_link(child, current->parent);
+		__ptrace_link(child, current->parent);
 	}
 }
 
-- 
cgit v1.2.3


From 85ec7fd9f8e528c4f61d595cfe4df7681a19f252 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 15 Dec 2009 16:47:17 -0800
Subject: ptrace: introduce user_single_step_siginfo() helper

Suggested by Roland.

Currently there is no way to synthesize a single-stepping trap in the
arch-independent manner.  This patch adds the default helper which fills
siginfo_t, arch/ can can override it.

Architetures which implement user_enable_single_step() should add
user_single_step_siginfo() also.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: <linux-arch@vger.kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 1951805df63a..56f2d63a5cbb 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -273,6 +273,18 @@ static inline void user_enable_block_step(struct task_struct *task)
 }
 #endif	/* arch_has_block_step */
 
+#ifdef ARCH_HAS_USER_SINGLE_STEP_INFO
+extern void user_single_step_siginfo(struct task_struct *tsk,
+				struct pt_regs *regs, siginfo_t *info);
+#else
+static inline void user_single_step_siginfo(struct task_struct *tsk,
+				struct pt_regs *regs, siginfo_t *info)
+{
+	memset(info, 0, sizeof(*info));
+	info->si_signo = SIGTRAP;
+}
+#endif
+
 #ifndef arch_ptrace_stop_needed
 /**
  * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called
-- 
cgit v1.2.3


From 2f0edac5555983dc28033acce8a355f588fd01b2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 15 Dec 2009 16:47:19 -0800
Subject: ptrace: change tracehook_report_syscall_exit() to handle stepping

Suggested by Roland.

Change tracehook_report_syscall_exit() to look at step flag and send the
trap signal if needed.

This change affects ia64, microblaze, parisc, powerpc, sh.  They pass
nonzero "step" argument to tracehook but since it was ignored the tracee
reports via ptrace_notify(), this is not right and not consistent.

	- PTRACE_SETSIGINFO doesn't work

	- if the tracer resumes the tracee with signr != 0 the new signal
	  is generated rather than delivering it

	- If PT_TRACESYSGOOD is set the tracee reports the wrong exit_code

I don't have a powerpc machine, but I think this test-case should see the
difference:

	#include <unistd.h>
	#include <sys/ptrace.h>
	#include <sys/wait.h>
	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		int pid, status;

		if (!(pid = fork())) {
			assert(ptrace(PTRACE_TRACEME) == 0);
			kill(getpid(), SIGSTOP);

			getppid();

			return 0;
		}

		assert(pid == wait(&status));
		assert(ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACESYSGOOD) == 0);

		assert(ptrace(PTRACE_SYSCALL, pid, 0,0) == 0);
		assert(pid == wait(&status));

		assert(ptrace(PTRACE_SINGLESTEP, pid, 0,0) == 0);
		assert(pid == wait(&status));

		if (status == 0x57F)
			return 0;

		printf("kernel bug: status=%X shouldn't have 0x80\n", status);
		return 1;
	}

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: <linux-arch@vger.kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/tracehook.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 1eb44a924e56..10db0102a890 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -134,6 +134,13 @@ static inline __must_check int tracehook_report_syscall_entry(
  */
 static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step)
 {
+	if (step) {
+		siginfo_t info;
+		user_single_step_siginfo(current, regs, &info);
+		force_sig_info(SIGTRAP, &info, current);
+		return;
+	}
+
 	ptrace_report_syscall(regs);
 }
 
-- 
cgit v1.2.3


From 614c517d7c00af1b26ded20646b329397d6f51a1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 15 Dec 2009 16:47:22 -0800
Subject: signals: SEND_SIG_NOINFO should be considered as SI_FROMUSER()

No changes in compiled code. The patch adds the new helper, si_fromuser()
and changes check_kill_permission() to use this helper.

The real effect of this patch is that from now we "officially" consider
SEND_SIG_NOINFO signal as "from user-space" signals. This is already true
if we look at the code which uses SEND_SIG_NOINFO, except __send_signal()
has another opinion - see the next patch.

The naming of these special SEND_SIG_XXX siginfo's is really bad
imho.  From __send_signal()'s pov they mean

	SEND_SIG_NOINFO		from user
	SEND_SIG_PRIV		from kernel
	SEND_SIG_FORCED		no info

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Reviewed-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  5 -----
 kernel/signal.c       | 16 +++++++++++++---
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f4c145410a8d..57b3516f055b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2102,11 +2102,6 @@ static inline int kill_cad_pid(int sig, int priv)
 #define SEND_SIG_PRIV	((struct siginfo *) 1)
 #define SEND_SIG_FORCED	((struct siginfo *) 2)
 
-static inline int is_si_special(const struct siginfo *info)
-{
-	return info <= SEND_SIG_FORCED;
-}
-
 /*
  * True if we are on the alternate signal stack.
  */
diff --git a/kernel/signal.c b/kernel/signal.c
index 6b982f2cf524..a0ba428954b6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -607,6 +607,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
 	return 1;
 }
 
+static inline int is_si_special(const struct siginfo *info)
+{
+	return info <= SEND_SIG_FORCED;
+}
+
+static inline bool si_fromuser(const struct siginfo *info)
+{
+	return info == SEND_SIG_NOINFO ||
+		(!is_si_special(info) && SI_FROMUSER(info));
+}
+
 /*
  * Bad permissions for sending the signal
  * - the caller must hold at least the RCU read lock
@@ -621,7 +632,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	if (!valid_signal(sig))
 		return -EINVAL;
 
-	if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info)))
+	if (!si_fromuser(info))
 		return 0;
 
 	error = audit_signal_info(sig, t); /* Let audit system see the signal */
@@ -1186,8 +1197,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 		goto out_unlock;
 	}
 	pcred = __task_cred(p);
-	if ((info == SEND_SIG_NOINFO ||
-	     (!is_si_special(info) && SI_FROMUSER(info))) &&
+	if (si_fromuser(info) &&
 	    euid != pcred->suid && euid != pcred->uid &&
 	    uid  != pcred->suid && uid  != pcred->uid) {
 		ret = -EPERM;
-- 
cgit v1.2.3


From ad09750b51150ca87531b8790a379214a974c167 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 15 Dec 2009 16:47:25 -0800
Subject: signals: kill force_sig_specific()

Kill force_sig_specific(), this trivial wrapper has no callers.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 1 -
 kernel/signal.c       | 6 ------
 2 files changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 57b3516f055b..244c287a5ac1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2083,7 +2083,6 @@ extern int kill_proc_info(int, struct siginfo *, pid_t);
 extern int do_notify_parent(struct task_struct *, int);
 extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
 extern void force_sig(int, struct task_struct *);
-extern void force_sig_specific(int, struct task_struct *);
 extern int send_sig(int, struct task_struct *, int);
 extern void zap_other_threads(struct task_struct *p);
 extern struct sigqueue *sigqueue_alloc(void);
diff --git a/kernel/signal.c b/kernel/signal.c
index d7c7f3cd4da8..4a9d763f8922 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1062,12 +1062,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	return ret;
 }
 
-void
-force_sig_specific(int sig, struct task_struct *t)
-{
-	force_sig_info(sig, SEND_SIG_FORCED, t);
-}
-
 /*
  * Nuke all other threads in the group.
  */
-- 
cgit v1.2.3


From b97e820ffffbf49e94ed60c9c26f1a54bccae924 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 15 Dec 2009 16:47:32 -0800
Subject: ipc/sem.c: add a per-semaphore pending list

Based on Nick's findings:

sysv sem has the concept of semaphore arrays that consist out of multiple
semaphores.  Atomic operations that affect multiple semaphores are
supported.

The patch is the first step for optimizing simple, single semaphore
operations: In addition to the global list of all pending operations, a
2nd, per-semaphore list with the simple operations is added.

Note: this patch does not make sense by itself, the new list is used
nowhere.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Pierre Peiffer <peifferp@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sem.h |  5 ++++-
 ipc/sem.c           | 37 ++++++++++++++++++++++++++++++++-----
 2 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sem.h b/include/linux/sem.h
index 1b191c176bcd..8a4adbef8a0f 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -86,6 +86,7 @@ struct task_struct;
 struct sem {
 	int	semval;		/* current value */
 	int	sempid;		/* pid of last operation */
+	struct list_head sem_pending; /* pending single-sop operations */
 };
 
 /* One sem_array data structure for each set of semaphores in the system. */
@@ -96,11 +97,13 @@ struct sem_array {
 	struct sem		*sem_base;	/* ptr to first semaphore in array */
 	struct list_head	sem_pending;	/* pending operations to be processed */
 	struct list_head	list_id;	/* undo requests on this array */
-	unsigned long		sem_nsems;	/* no. of semaphores in array */
+	int			sem_nsems;	/* no. of semaphores in array */
+	int			complex_count;	/* pending complex operations */
 };
 
 /* One queue for each sleeping process in the system. */
 struct sem_queue {
+	struct list_head	simple_list; /* queue of pending operations */
 	struct list_head	list;	 /* queue of pending operations */
 	struct task_struct	*sleeper; /* this process */
 	struct sem_undo		*undo;	 /* undo structure */
diff --git a/ipc/sem.c b/ipc/sem.c
index eac3f46a5968..4252a8eb77e2 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -241,6 +241,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	key_t key = params->key;
 	int nsems = params->u.nsems;
 	int semflg = params->flg;
+	int i;
 
 	if (!nsems)
 		return -EINVAL;
@@ -273,6 +274,11 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params)
 	ns->used_sems += nsems;
 
 	sma->sem_base = (struct sem *) &sma[1];
+
+	for (i = 0; i < nsems; i++)
+		INIT_LIST_HEAD(&sma->sem_base[i].sem_pending);
+
+	sma->complex_count = 0;
 	INIT_LIST_HEAD(&sma->sem_pending);
 	INIT_LIST_HEAD(&sma->list_id);
 	sma->sem_nsems = nsems;
@@ -419,6 +425,15 @@ static void wake_up_sem_queue(struct sem_queue *q, int error)
 	preempt_enable();
 }
 
+static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
+{
+	list_del(&q->list);
+	if (q->nsops == 1)
+		list_del(&q->simple_list);
+	else
+		sma->complex_count--;
+}
+
 /* Go through the pending queue for the indicated semaphore
  * looking for tasks that can be completed.
  */
@@ -438,7 +453,7 @@ again:
 		if (error > 0)
 			continue;
 
-		list_del(&q->list);
+		unlink_queue(sma, q);
 
 		/*
 		 * The next operation that must be checked depends on the type
@@ -532,8 +547,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 
 	/* Wake up all pending processes and let them fail with EIDRM. */
 	list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
-		list_del(&q->list);
-
+		unlink_queue(sma, q);
 		wake_up_sem_queue(q, -EIDRM);
 	}
 
@@ -1191,6 +1205,19 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	else
 		list_add(&queue.list, &sma->sem_pending);
 
+	if (nsops == 1) {
+		struct sem *curr;
+		curr = &sma->sem_base[sops->sem_num];
+
+		if (alter)
+			list_add_tail(&queue.simple_list, &curr->sem_pending);
+		else
+			list_add(&queue.simple_list, &curr->sem_pending);
+	} else {
+		INIT_LIST_HEAD(&queue.simple_list);
+		sma->complex_count++;
+	}
+
 	queue.status = -EINTR;
 	queue.sleeper = current;
 	current->state = TASK_INTERRUPTIBLE;
@@ -1232,7 +1259,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	 */
 	if (timeout && jiffies_left == 0)
 		error = -EAGAIN;
-	list_del(&queue.list);
+	unlink_queue(sma, &queue);
 
 out_unlock_free:
 	sem_unlock(sma);
@@ -1375,7 +1402,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
 	struct sem_array *sma = it;
 
 	return seq_printf(s,
-			  "%10d %10d  %4o %10lu %5u %5u %5u %5u %10lu %10lu\n",
+			  "%10d %10d  %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
 			  sma->sem_perm.key,
 			  sma->sem_perm.id,
 			  sma->sem_perm.mode,
-- 
cgit v1.2.3


From 9cf18e1dd74cd0061d58ac55029784ca3dd88f6a Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Tue, 15 Dec 2009 16:47:36 -0800
Subject: ipc: HARD_MSGMAX should be higher not lower on 64bit

We have HARD_MSGMAX lower on 64bit than on 32bit, since usually 64bit
machines have more memory than 32bit machines.

Making it higher on 64bit seems reasonable, and keep the original number
on 32bit.

Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ipc_namespace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index e408722a84c7..07baa38bce37 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -87,7 +87,7 @@ extern int mq_init_ns(struct ipc_namespace *ns);
 /* default values */
 #define DFLT_QUEUESMAX 256     /* max number of message queues */
 #define DFLT_MSGMAX    10      /* max number of messages in each queue */
-#define HARD_MSGMAX    (131072/sizeof(void *))
+#define HARD_MSGMAX    (32768*sizeof(void *)/4)
 #define DFLT_MSGSIZEMAX 8192   /* max message size */
 #else
 static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; }
-- 
cgit v1.2.3


From 06a7f711246b081afc21fff859f1003f1f2a0fbc Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Tue, 15 Dec 2009 16:47:46 -0800
Subject: kexec: premit reduction of the reserved memory size

Implement shrinking the reserved memory for crash kernel, if it is more
than enough.

For example, if you have already reserved 128M, now you just want 100M,
you can do:

# echo $((100*1024*1024)) > /sys/kernel/kexec_crash_size

Note, you can only do this before loading the crash kernel.

Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Neil Horman <nhorman@redhat.com>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kexec.h |  2 ++
 kernel/kexec.c        | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/ksysfs.c       | 21 ++++++++++++++++++
 3 files changed, 82 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index adc34f2c6eff..c356b6914ffd 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -206,6 +206,8 @@ extern size_t vmcoreinfo_max_size;
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
 		unsigned long long *crash_size, unsigned long long *crash_base);
+int crash_shrink_memory(unsigned long new_size);
+size_t crash_get_memory_size(void);
 
 #else /* !CONFIG_KEXEC */
 struct pt_regs;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f336e2107f98..433e9fcc1fc5 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -31,6 +31,7 @@
 #include <linux/cpu.h>
 #include <linux/console.h>
 #include <linux/vmalloc.h>
+#include <linux/swap.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -1082,6 +1083,64 @@ void crash_kexec(struct pt_regs *regs)
 	}
 }
 
+size_t crash_get_memory_size(void)
+{
+	size_t size;
+	mutex_lock(&kexec_mutex);
+	size = crashk_res.end - crashk_res.start + 1;
+	mutex_unlock(&kexec_mutex);
+	return size;
+}
+
+static void free_reserved_phys_range(unsigned long begin, unsigned long end)
+{
+	unsigned long addr;
+
+	for (addr = begin; addr < end; addr += PAGE_SIZE) {
+		ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
+		init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
+		free_page((unsigned long)__va(addr));
+		totalram_pages++;
+	}
+}
+
+int crash_shrink_memory(unsigned long new_size)
+{
+	int ret = 0;
+	unsigned long start, end;
+
+	mutex_lock(&kexec_mutex);
+
+	if (kexec_crash_image) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+	start = crashk_res.start;
+	end = crashk_res.end;
+
+	if (new_size >= end - start + 1) {
+		ret = -EINVAL;
+		if (new_size == end - start + 1)
+			ret = 0;
+		goto unlock;
+	}
+
+	start = roundup(start, PAGE_SIZE);
+	end = roundup(start + new_size, PAGE_SIZE);
+
+	free_reserved_phys_range(end, crashk_res.end);
+
+	if (start == end) {
+		crashk_res.end = end;
+		release_resource(&crashk_res);
+	} else
+		crashk_res.end = end - 1;
+
+unlock:
+	mutex_unlock(&kexec_mutex);
+	return ret;
+}
+
 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
 			    size_t data_len)
 {
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 528dd78e7e7e..3feaf5a74514 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
 }
 KERNEL_ATTR_RO(kexec_crash_loaded);
 
+static ssize_t kexec_crash_size_show(struct kobject *kobj,
+				       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%zu\n", crash_get_memory_size());
+}
+static ssize_t kexec_crash_size_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	unsigned long cnt;
+	int ret;
+
+	if (strict_strtoul(buf, 0, &cnt))
+		return -EINVAL;
+
+	ret = crash_shrink_memory(cnt);
+	return ret < 0 ? ret : count;
+}
+KERNEL_ATTR_RW(kexec_crash_size);
+
 static ssize_t vmcoreinfo_show(struct kobject *kobj,
 			       struct kobj_attribute *attr, char *buf)
 {
@@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_KEXEC
 	&kexec_loaded_attr.attr,
 	&kexec_crash_loaded_attr.attr,
+	&kexec_crash_size_attr.attr,
 	&vmcoreinfo_attr.attr,
 #endif
 	NULL
-- 
cgit v1.2.3


From fac046ad0b1ee2c4244ebf43a26433ef0ea29ae4 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 15 Dec 2009 16:47:47 -0800
Subject: aio: remove unused field

Don't know the reason, but it appears ki_wait field of iocb never gets used.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c            | 40 ++--------------------------------------
 include/linux/aio.h |  4 ----
 2 files changed, 2 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/fs/aio.c b/fs/aio.c
index c30dfc006108..1cf12b3dd83a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -711,10 +711,8 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
 	 */
 	ret = retry(iocb);
 
-	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
-		BUG_ON(!list_empty(&iocb->ki_wait.task_list));
+	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED)
 		aio_complete(iocb, ret, 0);
-	}
 out:
 	spin_lock_irq(&ctx->ctx_lock);
 
@@ -866,13 +864,6 @@ static void try_queue_kicked_iocb(struct kiocb *iocb)
 	unsigned long flags;
 	int run = 0;
 
-	/* We're supposed to be the only path putting the iocb back on the run
-	 * list.  If we find that the iocb is *back* on a wait queue already
-	 * than retry has happened before we could queue the iocb.  This also
-	 * means that the retry could have completed and freed our iocb, no
-	 * good. */
-	BUG_ON((!list_empty(&iocb->ki_wait.task_list)));
-
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
 	/* set this inside the lock so that we can't race with aio_run_iocb()
 	 * testing it and putting the iocb on the run list under the lock */
@@ -886,7 +877,7 @@ static void try_queue_kicked_iocb(struct kiocb *iocb)
 /*
  * kick_iocb:
  *      Called typically from a wait queue callback context
- *      (aio_wake_function) to trigger a retry of the iocb.
+ *      to trigger a retry of the iocb.
  *      The retry is usually executed by aio workqueue
  *      threads (See aio_kick_handler).
  */
@@ -1520,31 +1511,6 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 	return 0;
 }
 
-/*
- * aio_wake_function:
- * 	wait queue callback function for aio notification,
- * 	Simply triggers a retry of the operation via kick_iocb.
- *
- * 	This callback is specified in the wait queue entry in
- *	a kiocb.
- *
- * Note:
- * This routine is executed with the wait queue lock held.
- * Since kick_iocb acquires iocb->ctx->ctx_lock, it nests
- * the ioctx lock inside the wait queue lock. This is safe
- * because this callback isn't used for wait queues which
- * are nested inside ioctx lock (i.e. ctx->wait)
- */
-static int aio_wake_function(wait_queue_t *wait, unsigned mode,
-			     int sync, void *key)
-{
-	struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait);
-
-	list_del_init(&wait->task_list);
-	kick_iocb(iocb);
-	return 1;
-}
-
 static void aio_batch_add(struct address_space *mapping,
 			  struct hlist_head *batch_hash)
 {
@@ -1642,8 +1608,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
 	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
 	req->ki_opcode = iocb->aio_lio_opcode;
-	init_waitqueue_func_entry(&req->ki_wait, aio_wake_function);
-	INIT_LIST_HEAD(&req->ki_wait.task_list);
 
 	ret = aio_setup_iocb(req);
 
diff --git a/include/linux/aio.h b/include/linux/aio.h
index aea219d7d8d1..811dbb369379 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -102,7 +102,6 @@ struct kiocb {
 	} ki_obj;
 
 	__u64			ki_user_data;	/* user's data for completion */
-	wait_queue_t		ki_wait;
 	loff_t			ki_pos;
 
 	void			*private;
@@ -140,7 +139,6 @@ struct kiocb {
 		(x)->ki_dtor = NULL;			\
 		(x)->ki_obj.tsk = tsk;			\
 		(x)->ki_user_data = 0;                  \
-		init_wait((&(x)->ki_wait));             \
 	} while (0)
 
 #define AIO_RING_MAGIC			0xa10a10a1
@@ -223,8 +221,6 @@ struct mm_struct;
 static inline void exit_aio(struct mm_struct *mm) { }
 #endif /* CONFIG_AIO */
 
-#define io_wait_to_kiocb(wait) container_of(wait, struct kiocb, ki_wait)
-
 static inline struct kiocb *list_kiocb(struct list_head *h)
 {
 	return list_entry(h, struct kiocb, ki_list);
-- 
cgit v1.2.3


From 5fe878ae7f82fbf0830dbfaee4c5ca18f3aee442 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 15 Dec 2009 16:47:50 -0800
Subject: direct-io: cleanup blockdev_direct_IO locking

Currently the locking in blockdev_direct_IO is a mess, we have three
different locking types and very confusing checks for some of them.  The
most complicated one is DIO_OWN_LOCKING for reads, which happens to not
actually be used.

This patch gets rid of the DIO_OWN_LOCKING - as mentioned above the read
case is unused anyway, and the write side is almost identical to
DIO_NO_LOCKING.  The difference is that DIO_NO_LOCKING always sets the
create argument for the get_blocks callback to zero, but we can easily
move that to the actual get_blocks callbacks.  There are four users of the
DIO_NO_LOCKING mode: gfs already ignores the create argument and thus is
fine with the new version, ocfs2 only errors out if create were ever set,
and we can remove this dead code now, the block device code only ever uses
create for an error message if we are fully beyond the device which can
never happen, and last but not least XFS will need the new behavour for
writes.

Now we can replace the lock_type variable with a flags one, where no flag
means the DIO_NO_LOCKING behaviour and DIO_LOCKING is kept as the first
flag.  Separate out the check for not allowing to fill holes into a
separate flag, although for now both flags always get set at the same
time.

Also revamp the documentation of the locking scheme to actually make
sense.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alex Elder <aelder@sgi.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/direct-io.c              | 129 ++++++++++++++++++--------------------------
 fs/ocfs2/aops.c             |  34 ++----------
 fs/xfs/linux-2.6/xfs_aops.c |  20 +++----
 include/linux/fs.h          |  22 +++-----
 4 files changed, 71 insertions(+), 134 deletions(-)

(limited to 'include/linux')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9f34bb9b1ecb..4012885d027f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
  *
  * If blkfactor is zero then the user's request was aligned to the filesystem's
  * blocksize.
- *
- * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
- * This determines whether we need to do the fancy locking which prevents
- * direct-IO from being able to read uninitialised disk blocks.  If its zero
- * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
- * not held for the entire direct write (taken briefly, initially, during a
- * direct read though, but its never held for the duration of a direct-IO).
  */
 
 struct dio {
@@ -68,7 +61,7 @@ struct dio {
 	struct inode *inode;
 	int rw;
 	loff_t i_size;			/* i_size when submitted */
-	int lock_type;			/* doesn't change */
+	int flags;			/* doesn't change */
 	unsigned blkbits;		/* doesn't change */
 	unsigned blkfactor;		/* When we're using an alignment which
 					   is finer than the filesystem's soft
@@ -246,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
 	if (dio->end_io && dio->result)
 		dio->end_io(dio->iocb, offset, transferred,
 			    dio->map_bh.b_private);
-	if (dio->lock_type == DIO_LOCKING)
+
+	if (dio->flags & DIO_LOCKING)
 		/* lockdep: non-owner release */
 		up_read_non_owner(&dio->inode->i_alloc_sem);
 
@@ -521,21 +515,24 @@ static int get_more_blocks(struct dio *dio)
 		map_bh->b_state = 0;
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
 
+		/*
+		 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
+		 * forbid block creations: only overwrites are permitted.
+		 * We will return early to the caller once we see an
+		 * unmapped buffer head returned, and the caller will fall
+		 * back to buffered I/O.
+		 *
+		 * Otherwise the decision is left to the get_blocks method,
+		 * which may decide to handle it or also return an unmapped
+		 * buffer head.
+		 */
 		create = dio->rw & WRITE;
-		if (dio->lock_type == DIO_LOCKING) {
+		if (dio->flags & DIO_SKIP_HOLES) {
 			if (dio->block_in_file < (i_size_read(dio->inode) >>
 							dio->blkbits))
 				create = 0;
-		} else if (dio->lock_type == DIO_NO_LOCKING) {
-			create = 0;
 		}
 
-		/*
-		 * For writes inside i_size we forbid block creations: only
-		 * overwrites are permitted.  We fall back to buffered writes
-		 * at a higher level for inside-i_size block-instantiating
-		 * writes.
-		 */
 		ret = (*dio->get_block)(dio->inode, fs_startblk,
 						map_bh, create);
 	}
@@ -1045,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 * we can let i_mutex go now that its achieved its purpose
 	 * of protecting us from looking up uninitialized blocks.
 	 */
-	if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
+	if (rw == READ && (dio->flags & DIO_LOCKING))
 		mutex_unlock(&dio->inode->i_mutex);
 
 	/*
@@ -1092,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 
 /*
  * This is a library function for use by filesystem drivers.
- * The locking rules are governed by the dio_lock_type parameter.
  *
- * DIO_NO_LOCKING (no locking, for raw block device access)
- * For writes, i_mutex is not held on entry; it is never taken.
+ * The locking rules are governed by the flags parameter:
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *    For reads and writes i_alloc_sem is taken in shared mode and released
+ *    on I/O completion (which may happen asynchronously after returning to
+ *    the caller).
  *
- * DIO_LOCKING (simple locking for regular files)
- * For writes we are called under i_mutex and return with i_mutex held, even
- * though it is internally dropped.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
- * returning.
- *
- * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
- *	uninitialised data, allowing parallel direct readers and writers)
- * For writes we are called without i_mutex, return without it, never touch it.
- * For reads we are called under i_mutex and return with i_mutex held, even
- * though it may be internally dropped.
- *
- * Additional i_alloc_sem locking requirements described inline below.
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
+ *    internal locking but rather rely on the filesystem to synchronize
+ *    direct I/O reads/writes versus each other and truncate.
+ *    For reads and writes both i_mutex and i_alloc_sem are not held on
+ *    entry and are never taken.
  */
 ssize_t
 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
 	unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-	int dio_lock_type)
+	int flags)
 {
 	int seg;
 	size_t size;
@@ -1126,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
 	struct dio *dio;
-	int release_i_mutex = 0;
-	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
 		rw = WRITE_ODIRECT_PLUG;
@@ -1168,43 +1161,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 */
 	memset(dio, 0, offsetof(struct dio, pages));
 
-	/*
-	 * For block device access DIO_NO_LOCKING is used,
-	 *	neither readers nor writers do any locking at all
-	 * For regular files using DIO_LOCKING,
-	 *	readers need to grab i_mutex and i_alloc_sem
-	 *	writers need to grab i_alloc_sem only (i_mutex is already held)
-	 * For regular files using DIO_OWN_LOCKING,
-	 *	neither readers nor writers take any locks here
-	 */
-	dio->lock_type = dio_lock_type;
-	if (dio_lock_type != DIO_NO_LOCKING) {
+	dio->flags = flags;
+	if (dio->flags & DIO_LOCKING) {
 		/* watch out for a 0 len io from a tricksy fs */
 		if (rw == READ && end > offset) {
-			struct address_space *mapping;
+			struct address_space *mapping =
+					iocb->ki_filp->f_mapping;
 
-			mapping = iocb->ki_filp->f_mapping;
-			if (dio_lock_type != DIO_OWN_LOCKING) {
-				mutex_lock(&inode->i_mutex);
-				release_i_mutex = 1;
-			}
+			/* will be released by direct_io_worker */
+			mutex_lock(&inode->i_mutex);
 
 			retval = filemap_write_and_wait_range(mapping, offset,
 							      end - 1);
 			if (retval) {
+				mutex_unlock(&inode->i_mutex);
 				kfree(dio);
 				goto out;
 			}
-
-			if (dio_lock_type == DIO_OWN_LOCKING) {
-				mutex_unlock(&inode->i_mutex);
-				acquire_i_mutex = 1;
-			}
 		}
 
-		if (dio_lock_type == DIO_LOCKING)
-			/* lockdep: not the owner will release it */
-			down_read_non_owner(&inode->i_alloc_sem);
+		/*
+		 * Will be released at I/O completion, possibly in a
+		 * different thread.
+		 */
+		down_read_non_owner(&inode->i_alloc_sem);
 	}
 
 	/*
@@ -1222,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	/*
 	 * In case of error extending write may have instantiated a few
 	 * blocks outside i_size. Trim these off again for DIO_LOCKING.
-	 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
-	 * it's own meaner.
+	 *
+	 * NOTE: filesystems with their own locking have to handle this
+	 * on their own.
 	 */
-	if (unlikely(retval < 0 && (rw & WRITE))) {
-		loff_t isize = i_size_read(inode);
-
-		if (end > isize && dio_lock_type == DIO_LOCKING)
-			vmtruncate(inode, isize);
+	if (dio->flags & DIO_LOCKING) {
+		if (unlikely((rw & WRITE) && retval < 0)) {
+			loff_t isize = i_size_read(inode);
+			if (end > isize)
+				vmtruncate(inode, isize);
+		}
 	}
 
-	if (rw == READ && dio_lock_type == DIO_LOCKING)
-		release_i_mutex = 0;
-
 out:
-	if (release_i_mutex)
-		mutex_unlock(&inode->i_mutex);
-	else if (acquire_i_mutex)
-		mutex_lock(&inode->i_mutex);
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5e..3dae4a13f6e4 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
  *
  * called like this: dio->get_blocks(dio->inode, fs_startblk,
  * 					fs_count, map_bh, dio->rw == WRITE);
+ *
+ * Note that we never bother to allocate blocks here, and thus ignore the
+ * create argument.
  */
 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 				     struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 
 	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 
-	/*
-	 * Any write past EOF is not allowed because we'd be extending.
-	 */
-	if (create && (iblock + max_blocks) > inode_blocks) {
-		ret = -EIO;
-		goto bail;
-	}
-
 	/* This figures out the size of the next contiguous block, and
 	 * our logical offset */
 	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
-		ocfs2_error(inode->i_sb,
-			    "Inode %llu has a hole at block %llu\n",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-			    (unsigned long long)iblock);
-		ret = -EROFS;
-		goto bail;
-	}
-
 	/* We should already CoW the refcounted extent. */
 	BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
 	/*
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	 */
 	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
 		map_bh(bh_result, inode->i_sb, p_blkno);
-	else {
-		/*
-		 * ocfs2_prepare_inode_for_write() should have caught
-		 * the case where we'd be filling a hole and triggered
-		 * a buffered write instead.
-		 */
-		if (create) {
-			ret = -EIO;
-			mlog_errno(ret);
-			goto bail;
-		}
-
+	else
 		clear_buffer_mapped(bh_result);
-	}
 
 	/* make sure we don't map more than max_blocks blocks here as
 	   that's all the kernel will handle at this point. */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d798c54296eb..66abe36c1213 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1474,19 +1474,13 @@ xfs_vm_direct_IO(
 
 	bdev = xfs_find_bdev_for_inode(XFS_I(inode));
 
-	if (rw == WRITE) {
-		iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
-		ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
-			bdev, iov, offset, nr_segs,
-			xfs_get_blocks_direct,
-			xfs_end_io_direct);
-	} else {
-		iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
-		ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
-			bdev, iov, offset, nr_segs,
-			xfs_get_blocks_direct,
-			xfs_end_io_direct);
-	}
+	iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
+					IOMAP_UNWRITTEN : IOMAP_READ);
+
+	ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
+					    offset, nr_segs,
+					    xfs_get_blocks_direct,
+					    xfs_end_io_direct);
 
 	if (unlikely(ret != -EIOCBQUEUED && iocb->private))
 		xfs_destroy_ioend(iocb->private);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a057f48eb156..b23a7018eb90 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2264,9 +2264,11 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int lock_type);
 
 enum {
-	DIO_LOCKING = 1, /* need locking between buffered and direct access */
-	DIO_NO_LOCKING,  /* bdev; no locking at all between buffered/direct */
-	DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */
+	/* need locking between buffered and direct access */
+	DIO_LOCKING	= 0x01,
+
+	/* filesystem does not support filling holes */
+	DIO_SKIP_HOLES	= 0x02,
 };
 
 static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
@@ -2275,7 +2277,8 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb,
 	dio_iodone_t end_io)
 {
 	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-				nr_segs, get_block, end_io, DIO_LOCKING);
+				    nr_segs, get_block, end_io,
+				    DIO_LOCKING | DIO_SKIP_HOLES);
 }
 
 static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
@@ -2284,16 +2287,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
 	dio_iodone_t end_io)
 {
 	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-				nr_segs, get_block, end_io, DIO_NO_LOCKING);
-}
-
-static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb,
-	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs, get_block_t get_block,
-	dio_iodone_t end_io)
-{
-	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
-				nr_segs, get_block, end_io, DIO_OWN_LOCKING);
+				nr_segs, get_block, end_io, 0);
 }
 #endif
 
-- 
cgit v1.2.3


From f65380c07b64b0e45ad5c7d70ca54a9cde1c00db Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Tue, 15 Dec 2009 16:48:21 -0800
Subject: resource: constify arg to resource_size() and resource_type()

resource_size() doesn't change the resource it operates on, so the res
parameter can be marked const.  Same for resource_type().

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Reviewed-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ioport.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 83aa81297ea3..7129504e053d 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -126,11 +126,11 @@ extern int allocate_resource(struct resource *root, struct resource *new,
 int adjust_resource(struct resource *res, resource_size_t start,
 		    resource_size_t size);
 resource_size_t resource_alignment(struct resource *res);
-static inline resource_size_t resource_size(struct resource *res)
+static inline resource_size_t resource_size(const struct resource *res)
 {
 	return res->end - res->start + 1;
 }
-static inline unsigned long resource_type(struct resource *res)
+static inline unsigned long resource_type(const struct resource *res)
 {
 	return res->flags & IORESOURCE_TYPE_BITS;
 }
-- 
cgit v1.2.3


From c1a2a962a2ad103846e7950b4591471fabecece7 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 15 Dec 2009 16:48:25 -0800
Subject: bitmap: introduce bitmap_set, bitmap_clear,
 bitmap_find_next_zero_area

This introduces new bitmap functions:

bitmap_set: Set specified bit area
bitmap_clear: Clear specified bit area
bitmap_find_next_zero_area: Find free bit area

These are mostly stolen from iommu helper. The differences are:

- Use find_next_bit instead of doing test_bit for each bit

- Rewrite bitmap_set and bitmap_clear

  Instead of setting or clearing for each bit.

- Check the last bit of the limit

  iommu-helper doesn't want to find such area

- The return value if there is no zero area

  find_next_zero_area in iommu helper: returns -1
  bitmap_find_next_zero_area: return >= bitmap size

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Lothar Wassmann <LW@KARO-electronics.de>
Cc: Roland Dreier <rolandd@cisco.com>
Cc: Yevgeny Petrilin <yevgenyp@mellanox.co.il>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h | 11 +++++++
 lib/bitmap.c           | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 756d78b8c1c5..daf8c480c786 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -42,6 +42,9 @@
  * bitmap_empty(src, nbits)			Are all bits zero in *src?
  * bitmap_full(src, nbits)			Are all bits set in *src?
  * bitmap_weight(src, nbits)			Hamming Weight: number set bits
+ * bitmap_set(dst, pos, nbits)			Set specified bit area
+ * bitmap_clear(dst, pos, nbits)		Clear specified bit area
+ * bitmap_find_next_zero_area(buf, len, pos, n, mask)	Find bit free area
  * bitmap_shift_right(dst, src, n, nbits)	*dst = *src >> n
  * bitmap_shift_left(dst, src, n, nbits)	*dst = *src << n
  * bitmap_remap(dst, src, old, new, nbits)	*dst = map(old, new)(src)
@@ -108,6 +111,14 @@ extern int __bitmap_subset(const unsigned long *bitmap1,
 			const unsigned long *bitmap2, int bits);
 extern int __bitmap_weight(const unsigned long *bitmap, int bits);
 
+extern void bitmap_set(unsigned long *map, int i, int len);
+extern void bitmap_clear(unsigned long *map, int start, int nr);
+extern unsigned long bitmap_find_next_zero_area(unsigned long *map,
+					 unsigned long size,
+					 unsigned long start,
+					 unsigned int nr,
+					 unsigned long align_mask);
+
 extern int bitmap_scnprintf(char *buf, unsigned int len,
 			const unsigned long *src, int nbits);
 extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 702565821c99..11bf49750583 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -271,6 +271,87 @@ int __bitmap_weight(const unsigned long *bitmap, int bits)
 }
 EXPORT_SYMBOL(__bitmap_weight);
 
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
+
+void bitmap_set(unsigned long *map, int start, int nr)
+{
+	unsigned long *p = map + BIT_WORD(start);
+	const int size = start + nr;
+	int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+	unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+	while (nr - bits_to_set >= 0) {
+		*p |= mask_to_set;
+		nr -= bits_to_set;
+		bits_to_set = BITS_PER_LONG;
+		mask_to_set = ~0UL;
+		p++;
+	}
+	if (nr) {
+		mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+		*p |= mask_to_set;
+	}
+}
+EXPORT_SYMBOL(bitmap_set);
+
+void bitmap_clear(unsigned long *map, int start, int nr)
+{
+	unsigned long *p = map + BIT_WORD(start);
+	const int size = start + nr;
+	int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+	unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+	while (nr - bits_to_clear >= 0) {
+		*p &= ~mask_to_clear;
+		nr -= bits_to_clear;
+		bits_to_clear = BITS_PER_LONG;
+		mask_to_clear = ~0UL;
+		p++;
+	}
+	if (nr) {
+		mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+		*p &= ~mask_to_clear;
+	}
+}
+EXPORT_SYMBOL(bitmap_clear);
+
+/*
+ * bitmap_find_next_zero_area - find a contiguous aligned zero area
+ * @map: The address to base the search on
+ * @size: The bitmap size in bits
+ * @start: The bitnumber to start searching at
+ * @nr: The number of zeroed bits we're looking for
+ * @align_mask: Alignment mask for zero area
+ *
+ * The @align_mask should be one less than a power of 2; the effect is that
+ * the bit offset of all zero areas this function finds is multiples of that
+ * power of 2. A @align_mask of 0 means no alignment is required.
+ */
+unsigned long bitmap_find_next_zero_area(unsigned long *map,
+					 unsigned long size,
+					 unsigned long start,
+					 unsigned int nr,
+					 unsigned long align_mask)
+{
+	unsigned long index, end, i;
+again:
+	index = find_next_zero_bit(map, size, start);
+
+	/* Align allocation */
+	index = __ALIGN_MASK(index, align_mask);
+
+	end = index + nr;
+	if (end > size)
+		return end;
+	i = find_next_bit(map, end, index);
+	if (i < end) {
+		start = i + 1;
+		goto again;
+	}
+	return index;
+}
+EXPORT_SYMBOL(bitmap_find_next_zero_area);
+
 /*
  * Bitmap printing & parsing functions: first version by Bill Irwin,
  * second version by Paul Jackson, third by Joe Korty.
-- 
cgit v1.2.3


From a66022c457755b5eef61e30866114679c95e1f54 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 15 Dec 2009 16:48:28 -0800
Subject: iommu-helper: use bitmap library

Use bitmap library and kill some unused iommu helper functions.

1. s/iommu_area_free/bitmap_clear/

2. s/iommu_area_reserve/bitmap_set/

3. Use bitmap_find_next_zero_area instead of find_next_zero_area

  This cannot be simple substitution because find_next_zero_area
  doesn't check the last bit of the limit in bitmap

4. Remove iommu_area_free, iommu_area_reserve, and find_next_zero_area

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/iommu.c      |  4 +--
 arch/sparc/kernel/iommu.c        |  3 +-
 arch/x86/kernel/amd_iommu.c      |  4 +--
 arch/x86/kernel/pci-calgary_64.c |  6 ++--
 arch/x86/kernel/pci-gart_64.c    |  6 ++--
 include/linux/iommu-helper.h     |  3 --
 lib/iommu-helper.c               | 59 ++++++----------------------------------
 7 files changed, 21 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index fd51578e29dd..5547ae6e6b0b 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -30,7 +30,7 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/dma-mapping.h>
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
 #include <linux/iommu-helper.h>
 #include <linux/crash_dump.h>
 #include <asm/io.h>
@@ -251,7 +251,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	}
 
 	ppc_md.tce_free(tbl, entry, npages);
-	iommu_area_free(tbl->it_map, free_entry, npages);
+	bitmap_clear(tbl->it_map, free_entry, npages);
 }
 
 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 7690cc219ecc..5fad94950e76 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -11,6 +11,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/errno.h>
 #include <linux/iommu-helper.h>
+#include <linux/bitmap.h>
 
 #ifdef CONFIG_PCI
 #include <linux/pci.h>
@@ -169,7 +170,7 @@ void iommu_range_free(struct iommu *iommu, dma_addr_t dma_addr, unsigned long np
 
 	entry = (dma_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT;
 
-	iommu_area_free(arena->map, entry, npages);
+	bitmap_clear(arena->map, entry, npages);
 }
 
 int iommu_table_init(struct iommu *iommu, int tsbsize,
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index b990b5cc9541..23824fef789c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -19,7 +19,7 @@
 
 #include <linux/pci.h>
 #include <linux/gfp.h>
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
 #include <linux/debugfs.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-mapping.h>
@@ -1162,7 +1162,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 
 	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
 
-	iommu_area_free(range->bitmap, address, pages);
+	bitmap_clear(range->bitmap, address, pages);
 
 }
 
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index c563e4c8ff39..2bbde6078143 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -31,7 +31,7 @@
 #include <linux/string.h>
 #include <linux/crash_dump.h>
 #include <linux/dma-mapping.h>
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
 #include <linux/pci_ids.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
@@ -212,7 +212,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
 
 	spin_lock_irqsave(&tbl->it_lock, flags);
 
-	iommu_area_reserve(tbl->it_map, index, npages);
+	bitmap_set(tbl->it_map, index, npages);
 
 	spin_unlock_irqrestore(&tbl->it_lock, flags);
 }
@@ -303,7 +303,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 
 	spin_lock_irqsave(&tbl->it_lock, flags);
 
-	iommu_area_free(tbl->it_map, entry, npages);
+	bitmap_clear(tbl->it_map, entry, npages);
 
 	spin_unlock_irqrestore(&tbl->it_lock, flags);
 }
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 56c0e730d3fe..34de53b46f87 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -23,7 +23,7 @@
 #include <linux/module.h>
 #include <linux/topology.h>
 #include <linux/interrupt.h>
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
 #include <linux/kdebug.h>
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
@@ -126,7 +126,7 @@ static void free_iommu(unsigned long offset, int size)
 	unsigned long flags;
 
 	spin_lock_irqsave(&iommu_bitmap_lock, flags);
-	iommu_area_free(iommu_gart_bitmap, offset, size);
+	bitmap_clear(iommu_gart_bitmap, offset, size);
 	if (offset >= next_bit)
 		next_bit = offset + size;
 	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -792,7 +792,7 @@ int __init gart_iommu_init(void)
 	 * Out of IOMMU space handling.
 	 * Reserve some invalid pages at the beginning of the GART.
 	 */
-	iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
+	bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
 
 	pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
 	       iommu_size >> 20);
diff --git a/include/linux/iommu-helper.h b/include/linux/iommu-helper.h
index 3b068e5b5671..64d1b638745d 100644
--- a/include/linux/iommu-helper.h
+++ b/include/linux/iommu-helper.h
@@ -14,14 +14,11 @@ static inline unsigned long iommu_device_max_index(unsigned long size,
 extern int iommu_is_span_boundary(unsigned int index, unsigned int nr,
 				  unsigned long shift,
 				  unsigned long boundary_size);
-extern void iommu_area_reserve(unsigned long *map, unsigned long i, int len);
 extern unsigned long iommu_area_alloc(unsigned long *map, unsigned long size,
 				      unsigned long start, unsigned int nr,
 				      unsigned long shift,
 				      unsigned long boundary_size,
 				      unsigned long align_mask);
-extern void iommu_area_free(unsigned long *map, unsigned long start,
-			    unsigned int nr);
 
 extern unsigned long iommu_num_pages(unsigned long addr, unsigned long len,
 				     unsigned long io_page_size);
diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c
index 75dbda03f4fb..c0251f4ad08b 100644
--- a/lib/iommu-helper.c
+++ b/lib/iommu-helper.c
@@ -3,41 +3,7 @@
  */
 
 #include <linux/module.h>
-#include <linux/bitops.h>
-
-static unsigned long find_next_zero_area(unsigned long *map,
-					 unsigned long size,
-					 unsigned long start,
-					 unsigned int nr,
-					 unsigned long align_mask)
-{
-	unsigned long index, end, i;
-again:
-	index = find_next_zero_bit(map, size, start);
-
-	/* Align allocation */
-	index = (index + align_mask) & ~align_mask;
-
-	end = index + nr;
-	if (end >= size)
-		return -1;
-	for (i = index; i < end; i++) {
-		if (test_bit(i, map)) {
-			start = i+1;
-			goto again;
-		}
-	}
-	return index;
-}
-
-void iommu_area_reserve(unsigned long *map, unsigned long i, int len)
-{
-	unsigned long end = i + len;
-	while (i < end) {
-		__set_bit(i, map);
-		i++;
-	}
-}
+#include <linux/bitmap.h>
 
 int iommu_is_span_boundary(unsigned int index, unsigned int nr,
 			   unsigned long shift,
@@ -55,31 +21,24 @@ unsigned long iommu_area_alloc(unsigned long *map, unsigned long size,
 			       unsigned long align_mask)
 {
 	unsigned long index;
+
+	/* We don't want the last of the limit */
+	size -= 1;
 again:
-	index = find_next_zero_area(map, size, start, nr, align_mask);
-	if (index != -1) {
+	index = bitmap_find_next_zero_area(map, size, start, nr, align_mask);
+	if (index < size) {
 		if (iommu_is_span_boundary(index, nr, shift, boundary_size)) {
 			/* we could do more effectively */
 			start = index + 1;
 			goto again;
 		}
-		iommu_area_reserve(map, index, nr);
+		bitmap_set(map, index, nr);
+		return index;
 	}
-	return index;
+	return -1;
 }
 EXPORT_SYMBOL(iommu_area_alloc);
 
-void iommu_area_free(unsigned long *map, unsigned long start, unsigned int nr)
-{
-	unsigned long end = start + nr;
-
-	while (start < end) {
-		__clear_bit(start, map);
-		start++;
-	}
-}
-EXPORT_SYMBOL(iommu_area_free);
-
 unsigned long iommu_num_pages(unsigned long addr, unsigned long len,
 			      unsigned long io_page_size)
 {
-- 
cgit v1.2.3