From 174e964ec224c3c591b83a6b5f0984d905d3678f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 9 Oct 2014 12:43:27 -0700 Subject: regulator: Include err.h from consumer.h to fix build failure sh:sh2007_defconfig fails to build with the following error: In file included from include/linux/regulator/machine.h:18:0, from arch/sh/boards/board-sh2007.c:10: include/linux/regulator/consumer.h: In function 'regulator_get_optional': include/linux/regulator/consumer.h:271:2: error: implicit declaration of function 'ERR_PTR' include/linux/err.h: At top level: include/linux/err.h:23:35: error: conflicting types for 'ERR_PTR' include/linux/regulator/consumer.h:271:9: note: previous implicit declaration of 'ERR_PTR' was here Since consumer.h uses ERR_PTR, it should include err.h. Signed-off-by: Guenter Roeck Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index d347c805f923..f540b1496e2f 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -35,6 +35,8 @@ #ifndef __LINUX_REGULATOR_CONSUMER_H_ #define __LINUX_REGULATOR_CONSUMER_H_ +#include + struct device; struct notifier_block; struct regmap; -- cgit v1.2.3 From 70f3ce0510afdad7cbaf27ab7ab961377205c782 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 29 Sep 2014 11:47:54 +0200 Subject: mtd: spi-nor: make spi_nor_scan() take a chip type name, not spi_device_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drivers currently call spi_nor_match_id() and then spi_nor_scan(). This adds a dependency on struct spi_device_id which we want to avoid. Make spi_nor_scan() do it for them. Signed-off-by: Ben Hutchings Signed-off-by: Rafał Miłecki Signed-off-by: Brian Norris --- drivers/mtd/devices/m25p80.c | 4 +--- drivers/mtd/spi-nor/fsl-quadspi.c | 7 +------ drivers/mtd/spi-nor/spi-nor.c | 13 +++++++++---- include/linux/mtd/spi-nor.h | 20 +++----------------- 4 files changed, 14 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 822209d10689..bd5e4c6edfd4 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -193,7 +193,6 @@ static int m25p_probe(struct spi_device *spi) { struct mtd_part_parser_data ppdata; struct flash_platform_data *data; - const struct spi_device_id *id = NULL; struct m25p *flash; struct spi_nor *nor; enum read_mode mode = SPI_NOR_NORMAL; @@ -241,8 +240,7 @@ static int m25p_probe(struct spi_device *spi) else flash_name = spi->modalias; - id = spi_nor_match_id(flash_name); - ret = spi_nor_scan(nor, id, mode); + ret = spi_nor_scan(nor, flash_name, mode); if (ret) return ret; diff --git a/drivers/mtd/spi-nor/fsl-quadspi.c b/drivers/mtd/spi-nor/fsl-quadspi.c index 8d659a2888d5..d5269a26c839 100644 --- a/drivers/mtd/spi-nor/fsl-quadspi.c +++ b/drivers/mtd/spi-nor/fsl-quadspi.c @@ -881,7 +881,6 @@ static int fsl_qspi_probe(struct platform_device *pdev) /* iterate the subnodes. */ for_each_available_child_of_node(dev->of_node, np) { - const struct spi_device_id *id; char modalias[40]; /* skip the holes */ @@ -909,10 +908,6 @@ static int fsl_qspi_probe(struct platform_device *pdev) if (of_modalias_node(np, modalias, sizeof(modalias)) < 0) goto map_failed; - id = spi_nor_match_id(modalias); - if (!id) - goto map_failed; - ret = of_property_read_u32(np, "spi-max-frequency", &q->clk_rate); if (ret < 0) @@ -921,7 +916,7 @@ static int fsl_qspi_probe(struct platform_device *pdev) /* set the chip address for READID */ fsl_qspi_set_base_addr(q, nor); - ret = spi_nor_scan(nor, id, SPI_NOR_QUAD); + ret = spi_nor_scan(nor, modalias, SPI_NOR_QUAD); if (ret) goto map_failed; diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index ae16aa2f6885..5c8e39977bc5 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -28,6 +28,8 @@ #define JEDEC_MFR(_jedec_id) ((_jedec_id) >> 16) +static const struct spi_device_id *spi_nor_match_id(const char *name); + /* * Read the status register, returning its value in the location * Return the status register value. @@ -911,9 +913,9 @@ static int spi_nor_check(struct spi_nor *nor) return 0; } -int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, - enum read_mode mode) +int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode) { + const struct spi_device_id *id = NULL; struct flash_info *info; struct device *dev = nor->dev; struct mtd_info *mtd = nor->mtd; @@ -925,6 +927,10 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, if (ret) return ret; + id = spi_nor_match_id(name); + if (!id) + return -ENOENT; + info = (void *)id->driver_data; if (info->jedec_id) { @@ -1113,7 +1119,7 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, } EXPORT_SYMBOL_GPL(spi_nor_scan); -const struct spi_device_id *spi_nor_match_id(char *name) +static const struct spi_device_id *spi_nor_match_id(const char *name) { const struct spi_device_id *id = spi_nor_ids; @@ -1124,7 +1130,6 @@ const struct spi_device_id *spi_nor_match_id(char *name) } return NULL; } -EXPORT_SYMBOL_GPL(spi_nor_match_id); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Huang Shijie "); diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 9e6294f32ba8..a5a7a086748d 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -187,32 +187,18 @@ struct spi_nor { /** * spi_nor_scan() - scan the SPI NOR * @nor: the spi_nor structure - * @id: the spi_device_id provided by the driver + * @name: the chip type name * @mode: the read mode supported by the driver * * The drivers can use this fuction to scan the SPI NOR. * In the scanning, it will try to get all the necessary information to * fill the mtd_info{} and the spi_nor{}. * - * The board may assigns a spi_device_id with @id which be used to compared with - * the spi_device_id detected by the scanning. + * The chip type name can be provided through the @name parameter. * * Return: 0 for success, others for failure. */ -int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, - enum read_mode mode); +int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode); extern const struct spi_device_id spi_nor_ids[]; -/** - * spi_nor_match_id() - find the spi_device_id by the name - * @name: the name of the spi_device_id - * - * The drivers use this function to find the spi_device_id - * specified by the @name. - * - * Return: returns the right spi_device_id pointer on success, - * and returns NULL on failure. - */ -const struct spi_device_id *spi_nor_match_id(char *name); - #endif -- cgit v1.2.3 From a5b7616c55e188fe3d6ef686bef402d4703ecb62 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 30 Sep 2014 03:14:55 +0100 Subject: mtd: m25p80,spi-nor: Fix module aliases for m25p80 m25p80's device ID table is now spi_nor_ids, defined in spi-nor. The MODULE_DEVICE_TABLE() macro doesn't work with extern definitions, but its use was also removed at the same time. Now if m25p80 is built as a module it doesn't get the necessary aliases to be loaded automatically. A clean solution to this will involve defining the list of device IDs in spi-nor.h and removing struct spi_device_id from the spi-nor API, but this is quite a large change. As a quick fix suitable for stable, copy the device IDs back into m25p80. Fixes: 03e296f613af ("mtd: m25p80: use the SPI nor framework") Cc: # 3.16.x: 32f1b7c8352f: mtd: move support for struct flash_platform_data into m25p80 Cc: # 3.16.x: 90e55b3812a1: mtd: m25p80: get rid of spi_get_device_id Cc: # 3.16.x: 70f3ce0510af: mtd: spi-nor: make spi_nor_scan() take a chip type name, not spi_device_id Cc: # 3.16.x Signed-off-by: Ben Hutchings Signed-off-by: Brian Norris --- drivers/mtd/devices/m25p80.c | 52 ++++++++++++++++++++++++++++++++++++++++++- drivers/mtd/spi-nor/spi-nor.c | 3 +-- include/linux/mtd/spi-nor.h | 1 - 3 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index bd5e4c6edfd4..ed827cf894e4 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -261,12 +261,62 @@ static int m25p_remove(struct spi_device *spi) } +/* + * XXX This needs to be kept in sync with spi_nor_ids. We can't share + * it with spi-nor, because if this is built as a module then modpost + * won't be able to read it and add appropriate aliases. + */ +static const struct spi_device_id m25p_ids[] = { + {"at25fs010"}, {"at25fs040"}, {"at25df041a"}, {"at25df321a"}, + {"at25df641"}, {"at26f004"}, {"at26df081a"}, {"at26df161a"}, + {"at26df321"}, {"at45db081d"}, + {"en25f32"}, {"en25p32"}, {"en25q32b"}, {"en25p64"}, + {"en25q64"}, {"en25qh128"}, {"en25qh256"}, + {"f25l32pa"}, + {"mr25h256"}, {"mr25h10"}, + {"gd25q32"}, {"gd25q64"}, + {"160s33b"}, {"320s33b"}, {"640s33b"}, + {"mx25l2005a"}, {"mx25l4005a"}, {"mx25l8005"}, {"mx25l1606e"}, + {"mx25l3205d"}, {"mx25l3255e"}, {"mx25l6405d"}, {"mx25l12805d"}, + {"mx25l12855e"},{"mx25l25635e"},{"mx25l25655e"},{"mx66l51235l"}, + {"mx66l1g55g"}, + {"n25q064"}, {"n25q128a11"}, {"n25q128a13"}, {"n25q256a"}, + {"n25q512a"}, {"n25q512ax3"}, {"n25q00"}, + {"pm25lv512"}, {"pm25lv010"}, {"pm25lq032"}, + {"s25sl032p"}, {"s25sl064p"}, {"s25fl256s0"}, {"s25fl256s1"}, + {"s25fl512s"}, {"s70fl01gs"}, {"s25sl12800"}, {"s25sl12801"}, + {"s25fl129p0"}, {"s25fl129p1"}, {"s25sl004a"}, {"s25sl008a"}, + {"s25sl016a"}, {"s25sl032a"}, {"s25sl064a"}, {"s25fl008k"}, + {"s25fl016k"}, {"s25fl064k"}, + {"sst25vf040b"},{"sst25vf080b"},{"sst25vf016b"},{"sst25vf032b"}, + {"sst25vf064c"},{"sst25wf512"}, {"sst25wf010"}, {"sst25wf020"}, + {"sst25wf040"}, + {"m25p05"}, {"m25p10"}, {"m25p20"}, {"m25p40"}, + {"m25p80"}, {"m25p16"}, {"m25p32"}, {"m25p64"}, + {"m25p128"}, {"n25q032"}, + {"m25p05-nonjedec"}, {"m25p10-nonjedec"}, {"m25p20-nonjedec"}, + {"m25p40-nonjedec"}, {"m25p80-nonjedec"}, {"m25p16-nonjedec"}, + {"m25p32-nonjedec"}, {"m25p64-nonjedec"}, {"m25p128-nonjedec"}, + {"m45pe10"}, {"m45pe80"}, {"m45pe16"}, + {"m25pe20"}, {"m25pe80"}, {"m25pe16"}, + {"m25px16"}, {"m25px32"}, {"m25px32-s0"}, {"m25px32-s1"}, + {"m25px64"}, + {"w25x10"}, {"w25x20"}, {"w25x40"}, {"w25x80"}, + {"w25x16"}, {"w25x32"}, {"w25q32"}, {"w25q32dw"}, + {"w25x64"}, {"w25q64"}, {"w25q128"}, {"w25q80"}, + {"w25q80bl"}, {"w25q128"}, {"w25q256"}, {"cat25c11"}, + {"cat25c03"}, {"cat25c09"}, {"cat25c17"}, {"cat25128"}, + { }, +}; +MODULE_DEVICE_TABLE(spi, m25p_ids); + + static struct spi_driver m25p80_driver = { .driver = { .name = "m25p80", .owner = THIS_MODULE, }, - .id_table = spi_nor_ids, + .id_table = m25p_ids, .probe = m25p_probe, .remove = m25p_remove, diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 5c8e39977bc5..c51ee52386a7 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -475,7 +475,7 @@ struct flash_info { * more nor chips. This current list focusses on newer chips, which * have been converging on command sets which including JEDEC ID. */ -const struct spi_device_id spi_nor_ids[] = { +static const struct spi_device_id spi_nor_ids[] = { /* Atmel -- some are (confusingly) marketed as "DataFlash" */ { "at25fs010", INFO(0x1f6601, 0, 32 * 1024, 4, SECT_4K) }, { "at25fs040", INFO(0x1f6604, 0, 64 * 1024, 8, SECT_4K) }, @@ -639,7 +639,6 @@ const struct spi_device_id spi_nor_ids[] = { { "cat25128", CAT25_INFO(2048, 8, 64, 2, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) }, { }, }; -EXPORT_SYMBOL_GPL(spi_nor_ids); static const struct spi_device_id *spi_nor_read_id(struct spi_nor *nor) { diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index a5a7a086748d..046a0a2e4c4e 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -199,6 +199,5 @@ struct spi_nor { * Return: 0 for success, others for failure. */ int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode); -extern const struct spi_device_id spi_nor_ids[]; #endif -- cgit v1.2.3 From dda02fd6278d9e995850b3c1dba484f17cbe4de4 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Fri, 24 Oct 2014 17:47:57 +0800 Subject: mm, cma: make parameters order consistent in func declaration and definition In the current code, the base and size parameters order is not consistent in functions declaration and definition. If someone calls these functions according to the declaration parameters order in cma.h, he will run into some bug and it's hard to find the reason. This patch makes the parameters order consistent in functions declaration and definition. Signed-off-by: Weijie Yang Acked-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski --- include/linux/cma.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cma.h b/include/linux/cma.h index 0430ed05d3b9..a93438beb33c 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -18,12 +18,12 @@ struct cma; extern phys_addr_t cma_get_base(struct cma *cma); extern unsigned long cma_get_size(struct cma *cma); -extern int __init cma_declare_contiguous(phys_addr_t size, - phys_addr_t base, phys_addr_t limit, +extern int __init cma_declare_contiguous(phys_addr_t base, + phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, struct cma **res_cma); -extern int cma_init_reserved_mem(phys_addr_t size, - phys_addr_t base, int order_per_bit, +extern int cma_init_reserved_mem(phys_addr_t base, + phys_addr_t size, int order_per_bit, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align); extern bool cma_release(struct cma *cma, struct page *pages, int count); -- cgit v1.2.3 From a69d82b9bdf1e53e94423048e8bda8c5f5a3dd4e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 7 Oct 2014 17:47:36 +0200 Subject: power_supply: Add no_thermal property to prevent recursive get_temp calls Add a 'no_thermal' property to the power supply class. If true then thermal zone won't be created for this power supply in power_supply_register(). Power supply drivers may want to set it if they support POWER_SUPPLY_PROP_TEMP and they are forwarding this get property call to other thermal zone. If they won't set it lockdep may report false positive deadlock for thermal zone's mutex because of nested calls to thermal_zone_get_temp(). First is the call to thermal_zone_get_temp() of the driver's thermal zone. Thermal core gets POWER_SUPPLY_PROP_TEMP property from this driver. The driver then calls other thermal zone thermal_zone_get_temp() and returns result. Example of such driver is charger manager. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Sebastian Reichel --- drivers/power/power_supply_core.c | 3 +++ include/linux/power_supply.h | 6 ++++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c index 6cb7fe5c022d..694e8cddd5c1 100644 --- a/drivers/power/power_supply_core.c +++ b/drivers/power/power_supply_core.c @@ -417,6 +417,9 @@ static int psy_register_thermal(struct power_supply *psy) { int i; + if (psy->no_thermal) + return 0; + /* Register battery zone device psy reports temperature */ for (i = 0; i < psy->num_properties; i++) { if (psy->properties[i] == POWER_SUPPLY_PROP_TEMP) { diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 3ed049673022..096dbced02ac 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -200,6 +200,12 @@ struct power_supply { void (*external_power_changed)(struct power_supply *psy); void (*set_charged)(struct power_supply *psy); + /* + * Set if thermal zone should not be created for this power supply. + * For example for virtual supplies forwarding calls to actual + * sensors or other supplies. + */ + bool no_thermal; /* For APM emulation, think legacy userspace. */ int use_for_apm; -- cgit v1.2.3 From bdbe81445407644492b9ac69a24d35e3202d773b Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 13 Oct 2014 15:34:30 +0200 Subject: power: charger-manager: Fix accessing invalidated power supply after fuel gauge unbind The charger manager obtained reference to fuel gauge power supply in probe with power_supply_get_by_name() for later usage. However if fuel gauge driver was removed and re-added then this reference would point to old power supply (from driver which was removed). This lead to accessing old (and probably invalid) memory which could be observed with: $ echo "12-0036" > /sys/bus/i2c/drivers/max17042/unbind $ echo "12-0036" > /sys/bus/i2c/drivers/max17042/bind $ cat /sys/devices/virtual/power_supply/battery/capacity [ 240.480084] INFO: task cat:1393 blocked for more than 120 seconds. [ 240.484799] Not tainted 3.17.0-next-20141007-00028-ge60b6dd79570 #203 [ 240.491782] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 240.499589] cat D c0469530 0 1393 1 0x00000000 [ 240.505947] [] (__schedule) from [] (schedule_preempt_disabled+0x14/0x20) [ 240.514449] [] (schedule_preempt_disabled) from [] (mutex_lock_nested+0x1bc/0x458) [ 240.523736] [] (mutex_lock_nested) from [] (regmap_read+0x30/0x60) [ 240.531647] [] (regmap_read) from [] (max17042_get_property+0x2e8/0x350) [ 240.540055] [] (max17042_get_property) from [] (charger_get_property+0x264/0x348) [ 240.549252] [] (charger_get_property) from [] (power_supply_show_property+0x48/0x1e0) [ 240.558808] [] (power_supply_show_property) from [] (dev_attr_show+0x1c/0x48) [ 240.567664] [] (dev_attr_show) from [] (sysfs_kf_seq_show+0x84/0x104) [ 240.575814] [] (sysfs_kf_seq_show) from [] (kernfs_seq_show+0x24/0x28) [ 240.584061] [] (kernfs_seq_show) from [] (seq_read+0x1b0/0x484) [ 240.591702] [] (seq_read) from [] (vfs_read+0x88/0x144) [ 240.598640] [] (vfs_read) from [] (SyS_read+0x40/0x8c) [ 240.605507] [] (SyS_read) from [] (ret_fast_syscall+0x0/0x48) [ 240.612952] 4 locks held by cat/1393: [ 240.616589] #0: (&p->lock){+.+.+.}, at: [] seq_read+0x30/0x484 [ 240.623414] #1: (&of->mutex){+.+.+.}, at: [] kernfs_seq_start+0x1c/0x8c [ 240.631086] #2: (s_active#31){++++.+}, at: [] kernfs_seq_start+0x24/0x8c [ 240.638777] #3: (&map->mutex){+.+...}, at: [] regmap_read+0x30/0x60 The charger-manager should get reference to fuel gauge power supply on each use of get_property callback. The thermal zone 'tzd' field of power supply should not be used because of the same reason. Additionally this change solves also the issue with nested thermal_zone_get_temp() calls and related false lockdep positive for deadlock for thermal zone's mutex [1]. When fuel gauge is used as source of temperature then the charger manager forwards its get_temp calls to fuel gauge thermal zone. So actually different mutexes are used (one for charger manager thermal zone and second for fuel gauge thermal zone) but for lockdep this is one class of mutex. The recursion is removed by retrieving temperature through power supply's get_property(). In case external thermal zone is used ('cm-thermal-zone' property is present in DTS) the recursion does not exist. Charger manager simply exports POWER_SUPPLY_PROP_TEMP_AMBIENT property (instead of POWER_SUPPLY_PROP_TEMP) thus no thermal zone is created for this power supply. [1] https://lkml.org/lkml/2014/10/6/309 Signed-off-by: Krzysztof Kozlowski Cc: Fixes: 3bb3dbbd56ea ("power_supply: Add initial Charger-Manager driver") Signed-off-by: Sebastian Reichel --- drivers/power/charger-manager.c | 99 +++++++++++++++++++++++++---------- include/linux/power/charger-manager.h | 1 - 2 files changed, 71 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/charger-manager.c b/drivers/power/charger-manager.c index 7a1177ea238d..7ae8cb70204a 100644 --- a/drivers/power/charger-manager.c +++ b/drivers/power/charger-manager.c @@ -97,6 +97,7 @@ static struct charger_global_desc *g_desc; /* init with setup_charger_manager */ static bool is_batt_present(struct charger_manager *cm) { union power_supply_propval val; + struct power_supply *psy; bool present = false; int i, ret; @@ -107,7 +108,11 @@ static bool is_batt_present(struct charger_manager *cm) case CM_NO_BATTERY: break; case CM_FUEL_GAUGE: - ret = cm->fuel_gauge->get_property(cm->fuel_gauge, + psy = power_supply_get_by_name(cm->desc->psy_fuel_gauge); + if (!psy) + break; + + ret = psy->get_property(psy, POWER_SUPPLY_PROP_PRESENT, &val); if (ret == 0 && val.intval) present = true; @@ -167,12 +172,14 @@ static bool is_ext_pwr_online(struct charger_manager *cm) static int get_batt_uV(struct charger_manager *cm, int *uV) { union power_supply_propval val; + struct power_supply *fuel_gauge; int ret; - if (!cm->fuel_gauge) + fuel_gauge = power_supply_get_by_name(cm->desc->psy_fuel_gauge); + if (!fuel_gauge) return -ENODEV; - ret = cm->fuel_gauge->get_property(cm->fuel_gauge, + ret = fuel_gauge->get_property(fuel_gauge, POWER_SUPPLY_PROP_VOLTAGE_NOW, &val); if (ret) return ret; @@ -248,6 +255,7 @@ static bool is_full_charged(struct charger_manager *cm) { struct charger_desc *desc = cm->desc; union power_supply_propval val; + struct power_supply *fuel_gauge; int ret = 0; int uV; @@ -255,11 +263,15 @@ static bool is_full_charged(struct charger_manager *cm) if (!is_batt_present(cm)) return false; - if (cm->fuel_gauge && desc->fullbatt_full_capacity > 0) { + fuel_gauge = power_supply_get_by_name(cm->desc->psy_fuel_gauge); + if (!fuel_gauge) + return false; + + if (desc->fullbatt_full_capacity > 0) { val.intval = 0; /* Not full if capacity of fuel gauge isn't full */ - ret = cm->fuel_gauge->get_property(cm->fuel_gauge, + ret = fuel_gauge->get_property(fuel_gauge, POWER_SUPPLY_PROP_CHARGE_FULL, &val); if (!ret && val.intval > desc->fullbatt_full_capacity) return true; @@ -273,10 +285,10 @@ static bool is_full_charged(struct charger_manager *cm) } /* Full, if the capacity is more than fullbatt_soc */ - if (cm->fuel_gauge && desc->fullbatt_soc > 0) { + if (desc->fullbatt_soc > 0) { val.intval = 0; - ret = cm->fuel_gauge->get_property(cm->fuel_gauge, + ret = fuel_gauge->get_property(fuel_gauge, POWER_SUPPLY_PROP_CAPACITY, &val); if (!ret && val.intval >= desc->fullbatt_soc) return true; @@ -551,6 +563,20 @@ static int check_charging_duration(struct charger_manager *cm) return ret; } +static int cm_get_battery_temperature_by_psy(struct charger_manager *cm, + int *temp) +{ + struct power_supply *fuel_gauge; + + fuel_gauge = power_supply_get_by_name(cm->desc->psy_fuel_gauge); + if (!fuel_gauge) + return -ENODEV; + + return fuel_gauge->get_property(fuel_gauge, + POWER_SUPPLY_PROP_TEMP, + (union power_supply_propval *)temp); +} + static int cm_get_battery_temperature(struct charger_manager *cm, int *temp) { @@ -560,15 +586,18 @@ static int cm_get_battery_temperature(struct charger_manager *cm, return -ENODEV; #ifdef CONFIG_THERMAL - ret = thermal_zone_get_temp(cm->tzd_batt, (unsigned long *)temp); - if (!ret) - /* Calibrate temperature unit */ - *temp /= 100; -#else - ret = cm->fuel_gauge->get_property(cm->fuel_gauge, - POWER_SUPPLY_PROP_TEMP, - (union power_supply_propval *)temp); + if (cm->tzd_batt) { + ret = thermal_zone_get_temp(cm->tzd_batt, (unsigned long *)temp); + if (!ret) + /* Calibrate temperature unit */ + *temp /= 100; + } else #endif + { + /* if-else continued from CONFIG_THERMAL */ + ret = cm_get_battery_temperature_by_psy(cm, temp); + } + return ret; } @@ -827,6 +856,7 @@ static int charger_get_property(struct power_supply *psy, struct charger_manager *cm = container_of(psy, struct charger_manager, charger_psy); struct charger_desc *desc = cm->desc; + struct power_supply *fuel_gauge; int ret = 0; int uV; @@ -857,14 +887,20 @@ static int charger_get_property(struct power_supply *psy, ret = get_batt_uV(cm, &val->intval); break; case POWER_SUPPLY_PROP_CURRENT_NOW: - ret = cm->fuel_gauge->get_property(cm->fuel_gauge, + fuel_gauge = power_supply_get_by_name(cm->desc->psy_fuel_gauge); + if (!fuel_gauge) { + ret = -ENODEV; + break; + } + ret = fuel_gauge->get_property(fuel_gauge, POWER_SUPPLY_PROP_CURRENT_NOW, val); break; case POWER_SUPPLY_PROP_TEMP: case POWER_SUPPLY_PROP_TEMP_AMBIENT: return cm_get_battery_temperature(cm, &val->intval); case POWER_SUPPLY_PROP_CAPACITY: - if (!cm->fuel_gauge) { + fuel_gauge = power_supply_get_by_name(cm->desc->psy_fuel_gauge); + if (!fuel_gauge) { ret = -ENODEV; break; } @@ -875,7 +911,7 @@ static int charger_get_property(struct power_supply *psy, break; } - ret = cm->fuel_gauge->get_property(cm->fuel_gauge, + ret = fuel_gauge->get_property(fuel_gauge, POWER_SUPPLY_PROP_CAPACITY, val); if (ret) break; @@ -924,7 +960,14 @@ static int charger_get_property(struct power_supply *psy, break; case POWER_SUPPLY_PROP_CHARGE_NOW: if (is_charging(cm)) { - ret = cm->fuel_gauge->get_property(cm->fuel_gauge, + fuel_gauge = power_supply_get_by_name( + cm->desc->psy_fuel_gauge); + if (!fuel_gauge) { + ret = -ENODEV; + break; + } + + ret = fuel_gauge->get_property(fuel_gauge, POWER_SUPPLY_PROP_CHARGE_NOW, val); if (ret) { @@ -1486,14 +1529,15 @@ err: return ret; } -static int cm_init_thermal_data(struct charger_manager *cm) +static int cm_init_thermal_data(struct charger_manager *cm, + struct power_supply *fuel_gauge) { struct charger_desc *desc = cm->desc; union power_supply_propval val; int ret; /* Verify whether fuel gauge provides battery temperature */ - ret = cm->fuel_gauge->get_property(cm->fuel_gauge, + ret = fuel_gauge->get_property(fuel_gauge, POWER_SUPPLY_PROP_TEMP, &val); if (!ret) { @@ -1503,8 +1547,6 @@ static int cm_init_thermal_data(struct charger_manager *cm) cm->desc->measure_battery_temp = true; } #ifdef CONFIG_THERMAL - cm->tzd_batt = cm->fuel_gauge->tzd; - if (ret && desc->thermal_zone) { cm->tzd_batt = thermal_zone_get_zone_by_name(desc->thermal_zone); @@ -1667,6 +1709,7 @@ static int charger_manager_probe(struct platform_device *pdev) int ret = 0, i = 0; int j = 0; union power_supply_propval val; + struct power_supply *fuel_gauge; if (g_desc && !rtc_dev && g_desc->rtc_name) { rtc_dev = rtc_class_open(g_desc->rtc_name); @@ -1745,8 +1788,8 @@ static int charger_manager_probe(struct platform_device *pdev) } } - cm->fuel_gauge = power_supply_get_by_name(desc->psy_fuel_gauge); - if (!cm->fuel_gauge) { + fuel_gauge = power_supply_get_by_name(desc->psy_fuel_gauge); + if (!fuel_gauge) { dev_err(&pdev->dev, "Cannot find power supply \"%s\"\n", desc->psy_fuel_gauge); return -ENODEV; @@ -1789,13 +1832,13 @@ static int charger_manager_probe(struct platform_device *pdev) cm->charger_psy.num_properties = psy_default.num_properties; /* Find which optional psy-properties are available */ - if (!cm->fuel_gauge->get_property(cm->fuel_gauge, + if (!fuel_gauge->get_property(fuel_gauge, POWER_SUPPLY_PROP_CHARGE_NOW, &val)) { cm->charger_psy.properties[cm->charger_psy.num_properties] = POWER_SUPPLY_PROP_CHARGE_NOW; cm->charger_psy.num_properties++; } - if (!cm->fuel_gauge->get_property(cm->fuel_gauge, + if (!fuel_gauge->get_property(fuel_gauge, POWER_SUPPLY_PROP_CURRENT_NOW, &val)) { cm->charger_psy.properties[cm->charger_psy.num_properties] = @@ -1803,7 +1846,7 @@ static int charger_manager_probe(struct platform_device *pdev) cm->charger_psy.num_properties++; } - ret = cm_init_thermal_data(cm); + ret = cm_init_thermal_data(cm, fuel_gauge); if (ret) { dev_err(&pdev->dev, "Failed to initialize thermal data\n"); cm->desc->measure_battery_temp = false; diff --git a/include/linux/power/charger-manager.h b/include/linux/power/charger-manager.h index 07e7945a1ff2..5d90d329e0ba 100644 --- a/include/linux/power/charger-manager.h +++ b/include/linux/power/charger-manager.h @@ -253,7 +253,6 @@ struct charger_manager { struct device *dev; struct charger_desc *desc; - struct power_supply *fuel_gauge; struct power_supply **charger_stat; #ifdef CONFIG_THERMAL -- cgit v1.2.3 From cdaf3e15385d3232b52287e50692506f8fd01a09 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 13 Oct 2014 15:34:31 +0200 Subject: power: charger-manager: Fix accessing invalidated power supply after charger unbind The charger manager obtained in probe references to power supplies for all chargers with power_supply_get_by_name() for later usage. However if such charger driver was removed then this reference would point to old power supply (from driver which was removed). This lead to accessing invalid memory which could be observed with: $ echo "max77693-charger" > /sys/bus/platform/drivers/max77693-charger/unbind $ grep . /sys/devices/virtual/power_supply/battery/charger.0/* $ grep . /sys/devices/virtual/power_supply/battery/* [ 15.339817] Unable to handle kernel paging request at virtual address 0001c12c [ 15.346187] pgd = edd08000 [ 15.348814] [0001c12c] *pgd=6dce2831, *pte=00000000, *ppte=00000000 [ 15.355075] Internal error: Oops: 80000007 [#1] PREEMPT SMP ARM [ 15.360967] Modules linked in: [ 15.364010] CPU: 2 PID: 1388 Comm: grep Not tainted 3.17.0-next-20141007-00027-ga95e761db1b0 #245 [ 15.372859] task: ee03ad00 ti: edcf6000 task.ti: edcf6000 [ 15.378241] PC is at 0x1c12c [ 15.381113] LR is at is_ext_pwr_online+0x30/0x6c [ 15.385706] pc : [<0001c12c>] lr : [] psr: a0000013 [ 15.385706] sp : edcf7e88 ip : 00000000 fp : 00000000 [ 15.397161] r10: eeb02c08 r9 : c04b1f84 r8 : eeb02c00 [ 15.402369] r7 : edc69a10 r6 : eea6ac10 r5 : eea6ac10 r4 : 00000004 [ 15.408878] r3 : 0001c12c r2 : edcf7e8c r1 : 00000004 r0 : ee914418 [ 15.415390] Flags: NzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user [ 15.422506] Control: 10c5387d Table: 6dd0804a DAC: 00000015 [ 15.428236] Process grep (pid: 1388, stack limit = 0xedcf6240) [ 15.434050] Stack: (0xedcf7e88 to 0xedcf8000) [ 15.438395] 7e80: ee03ad00 00000000 edcf7f80 eea6aca8 edcf7ec4 c033b7b0 [ 15.446554] 7ea0: 00000001 ee1cc3f0 00000004 c06e1e44 eebdc000 c06e1e44 eeb02c00 c0337144 [ 15.454713] 7ec0: ee2dac68 c005cffc ee1cc3c0 c06e1e44 00000fff 00001000 eebdc000 c0278ca8 [ 15.462872] 7ee0: c0278c8c ee1cc3c0 eeb7ce00 c014422c edcf7f20 00008000 ee1cc3c0 ee9a48c0 [ 15.471030] 7f00: 00000001 00000001 edcf7f80 c0142d94 c0142d70 c01060f4 00021000 ee1cc3f0 [ 15.479190] 7f20: 00000000 00000000 c06a2150 eebdc000 2e7ec000 ee9a48c0 00008000 00021000 [ 15.487349] 7f40: edcf7f80 00008000 edcf6000 00021000 00021000 c00e39a4 00000000 ee9a48c0 [ 15.495508] 7f60: 00004000 00000000 00000000 ee9a48c0 ee9a48c0 00008000 00021000 c00e3aa0 [ 15.503668] 7f80: 00000000 00000000 0001f2e0 0001f2e0 00021000 00001000 00000003 c000f364 [ 15.511826] 7fa0: 00000000 c000f1a0 0001f2e0 00021000 00000003 00021000 00008000 00000000 [ 15.519986] 7fc0: 0001f2e0 00021000 00001000 00000003 00000001 000205e8 00000000 00021000 [ 15.528145] 7fe0: 00008000 bebbe910 0000a7ad b6edc49c 60000010 00000003 aaaaaaaa aaaaaaaa [ 15.536320] [] (is_ext_pwr_online) from [] (charger_get_property+0x170/0x314) [ 15.545164] [] (charger_get_property) from [] (power_supply_show_property+0x48/0x20c) [ 15.554719] [] (power_supply_show_property) from [] (dev_attr_show+0x1c/0x48) [ 15.563577] [] (dev_attr_show) from [] (sysfs_kf_seq_show+0x84/0x104) [ 15.571725] [] (sysfs_kf_seq_show) from [] (kernfs_seq_show+0x24/0x28) [ 15.579973] [] (kernfs_seq_show) from [] (seq_read+0x1b0/0x484) [ 15.587614] [] (seq_read) from [] (vfs_read+0x88/0x144) [ 15.594552] [] (vfs_read) from [] (SyS_read+0x40/0x8c) [ 15.601417] [] (SyS_read) from [] (ret_fast_syscall+0x0/0x48) [ 15.608877] Code: bad PC value [ 15.611991] ---[ end trace a88fcc95208db283 ]--- The charger-manager should get reference to charger power supply on each use of get_property callback. Signed-off-by: Krzysztof Kozlowski Cc: Fixes: 3bb3dbbd56ea ("power_supply: Add initial Charger-Manager driver") Signed-off-by: Sebastian Reichel --- drivers/power/charger-manager.c | 64 +++++++++++++++++++++-------------- include/linux/power/charger-manager.h | 2 -- 2 files changed, 39 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/charger-manager.c b/drivers/power/charger-manager.c index 7ae8cb70204a..ef8094a61f1e 100644 --- a/drivers/power/charger-manager.c +++ b/drivers/power/charger-manager.c @@ -118,10 +118,17 @@ static bool is_batt_present(struct charger_manager *cm) present = true; break; case CM_CHARGER_STAT: - for (i = 0; cm->charger_stat[i]; i++) { - ret = cm->charger_stat[i]->get_property( - cm->charger_stat[i], - POWER_SUPPLY_PROP_PRESENT, &val); + for (i = 0; cm->desc->psy_charger_stat[i]; i++) { + psy = power_supply_get_by_name( + cm->desc->psy_charger_stat[i]); + if (!psy) { + dev_err(cm->dev, "Cannot find power supply \"%s\"\n", + cm->desc->psy_charger_stat[i]); + continue; + } + + ret = psy->get_property(psy, POWER_SUPPLY_PROP_PRESENT, + &val); if (ret == 0 && val.intval) { present = true; break; @@ -144,14 +151,20 @@ static bool is_batt_present(struct charger_manager *cm) static bool is_ext_pwr_online(struct charger_manager *cm) { union power_supply_propval val; + struct power_supply *psy; bool online = false; int i, ret; /* If at least one of them has one, it's yes. */ - for (i = 0; cm->charger_stat[i]; i++) { - ret = cm->charger_stat[i]->get_property( - cm->charger_stat[i], - POWER_SUPPLY_PROP_ONLINE, &val); + for (i = 0; cm->desc->psy_charger_stat[i]; i++) { + psy = power_supply_get_by_name(cm->desc->psy_charger_stat[i]); + if (!psy) { + dev_err(cm->dev, "Cannot find power supply \"%s\"\n", + cm->desc->psy_charger_stat[i]); + continue; + } + + ret = psy->get_property(psy, POWER_SUPPLY_PROP_ONLINE, &val); if (ret == 0 && val.intval) { online = true; break; @@ -196,6 +209,7 @@ static bool is_charging(struct charger_manager *cm) { int i, ret; bool charging = false; + struct power_supply *psy; union power_supply_propval val; /* If there is no battery, it cannot be charged */ @@ -203,17 +217,22 @@ static bool is_charging(struct charger_manager *cm) return false; /* If at least one of the charger is charging, return yes */ - for (i = 0; cm->charger_stat[i]; i++) { + for (i = 0; cm->desc->psy_charger_stat[i]; i++) { /* 1. The charger sholuld not be DISABLED */ if (cm->emergency_stop) continue; if (!cm->charger_enabled) continue; + psy = power_supply_get_by_name(cm->desc->psy_charger_stat[i]); + if (!psy) { + dev_err(cm->dev, "Cannot find power supply \"%s\"\n", + cm->desc->psy_charger_stat[i]); + continue; + } + /* 2. The charger should be online (ext-power) */ - ret = cm->charger_stat[i]->get_property( - cm->charger_stat[i], - POWER_SUPPLY_PROP_ONLINE, &val); + ret = psy->get_property(psy, POWER_SUPPLY_PROP_ONLINE, &val); if (ret) { dev_warn(cm->dev, "Cannot read ONLINE value from %s\n", cm->desc->psy_charger_stat[i]); @@ -226,9 +245,7 @@ static bool is_charging(struct charger_manager *cm) * 3. The charger should not be FULL, DISCHARGING, * or NOT_CHARGING. */ - ret = cm->charger_stat[i]->get_property( - cm->charger_stat[i], - POWER_SUPPLY_PROP_STATUS, &val); + ret = psy->get_property(psy, POWER_SUPPLY_PROP_STATUS, &val); if (ret) { dev_warn(cm->dev, "Cannot read STATUS value from %s\n", cm->desc->psy_charger_stat[i]); @@ -1773,15 +1790,12 @@ static int charger_manager_probe(struct platform_device *pdev) while (desc->psy_charger_stat[i]) i++; - cm->charger_stat = devm_kzalloc(&pdev->dev, - sizeof(struct power_supply *) * i, GFP_KERNEL); - if (!cm->charger_stat) - return -ENOMEM; - + /* Check if charger's supplies are present at probe */ for (i = 0; desc->psy_charger_stat[i]; i++) { - cm->charger_stat[i] = power_supply_get_by_name( - desc->psy_charger_stat[i]); - if (!cm->charger_stat[i]) { + struct power_supply *psy; + + psy = power_supply_get_by_name(desc->psy_charger_stat[i]); + if (!psy) { dev_err(&pdev->dev, "Cannot find power supply \"%s\"\n", desc->psy_charger_stat[i]); return -ENODEV; @@ -2110,8 +2124,8 @@ static bool find_power_supply(struct charger_manager *cm, int i; bool found = false; - for (i = 0; cm->charger_stat[i]; i++) { - if (psy == cm->charger_stat[i]) { + for (i = 0; cm->desc->psy_charger_stat[i]; i++) { + if (!strcmp(psy->name, cm->desc->psy_charger_stat[i])) { found = true; break; } diff --git a/include/linux/power/charger-manager.h b/include/linux/power/charger-manager.h index 5d90d329e0ba..e97fc656a058 100644 --- a/include/linux/power/charger-manager.h +++ b/include/linux/power/charger-manager.h @@ -253,8 +253,6 @@ struct charger_manager { struct device *dev; struct charger_desc *desc; - struct power_supply **charger_stat; - #ifdef CONFIG_THERMAL struct thermal_zone_device *tzd_batt; #endif -- cgit v1.2.3 From e999dbc254044e8d2a5818d92d205f65bae28f37 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 Oct 2014 17:13:57 +0200 Subject: Revert "block: all blk-mq requests are tagged" This reverts commit fb3ccb5da71273e7f0d50b50bc879e50cedd60e7. SCSI-2/SPI actually needs the tagged/untagged flag in the request to work properly. Revert this patch and add a follow on to set it in the right place. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Jens Axboe Reported-by: Meelis Roos Tested-by: Meelis Roos Cc: stable@vger.kernel.org --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0207a78a8d82..51d0dc2259cf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1136,8 +1136,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) /* * tag stuff */ -#define blk_rq_tagged(rq) \ - ((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED)) +#define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) extern int blk_queue_start_tag(struct request_queue *, struct request *); extern struct request *blk_queue_find_tag(struct request_queue *, int); extern void blk_queue_end_tag(struct request_queue *, struct request *); -- cgit v1.2.3 From 5631b8fba640a4ab2f8a954f63a603fa34eda96b Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Sat, 25 Oct 2014 15:09:42 -0700 Subject: compiler/gcc4+: Remove inaccurate comment about 'asm goto' miscompiles The bug referenced by the comment in this commit was not completely fixed in GCC 4.8.2, as I mentioned in a thread back in February: https://lkml.org/lkml/2014/2/12/797 The conclusion at that time was to make the quirk unconditional until the bug could be found and fixed in GCC. Unfortunately, when I submitted the patch (commit a9f18034) I left a comment in that claimed the bug was fixed in GCC 4.8.2+. This comment is inaccurate, and should be removed. Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar Cc: Jakub Jelinek Cc: Richard Henderson Cc: Linus Torvalds Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1414274982-14040-1-git-send-email-steven@uplinklabs.net Cc: Ingo Molnar --- include/linux/compiler-gcc4.h | 1 - include/linux/compiler-gcc5.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index 2507fd2a1eb4..d1a558239b1a 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -71,7 +71,6 @@ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 * * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * Fixed in GCC 4.8.2 and later versions. * * (asm goto is automatically volatile - the naming reflects this.) */ diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h index cdd1cc202d51..c8c565952548 100644 --- a/include/linux/compiler-gcc5.h +++ b/include/linux/compiler-gcc5.h @@ -53,7 +53,6 @@ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 * * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * Fixed in GCC 4.8.2 and later versions. * * (asm goto is automatically volatile - the naming reflects this.) */ -- cgit v1.2.3 From ebcf34f3d4be11f994340aff629f3c17171a4f65 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 26 Oct 2014 19:14:06 -0700 Subject: skbuff.h: fix kernel-doc warning for headers_end Fix kernel-doc warning in by making both headers_start and headers_end private fields. Warning(..//include/linux/skbuff.h:654): No description found for parameter 'headers_end[0]' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a59d9343c25b..5884f95ff0e9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -557,7 +557,9 @@ struct sk_buff { /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() */ + /* private: */ __u32 headers_start[0]; + /* public: */ /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD @@ -642,7 +644,9 @@ struct sk_buff { __u16 network_header; __u16 mac_header; + /* private: */ __u32 headers_end[0]; + /* public: */ /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; -- cgit v1.2.3 From 1efed2d06c703489342ab6af2951683e07509c99 Mon Sep 17 00:00:00 2001 From: Olivier Blin Date: Fri, 24 Oct 2014 19:43:00 +0200 Subject: usbnet: add a callback for set_rx_mode To delegate promiscuous mode and multicast filtering to the subdriver. Signed-off-by: Olivier Blin Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 20 ++++++++++++++++++++ include/linux/usb/usbnet.h | 4 ++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 20615bbd693b..3a6770a65d78 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1052,6 +1052,21 @@ static void __handle_link_change(struct usbnet *dev) clear_bit(EVENT_LINK_CHANGE, &dev->flags); } +static void usbnet_set_rx_mode(struct net_device *net) +{ + struct usbnet *dev = netdev_priv(net); + + usbnet_defer_kevent(dev, EVENT_SET_RX_MODE); +} + +static void __handle_set_rx_mode(struct usbnet *dev) +{ + if (dev->driver_info->set_rx_mode) + (dev->driver_info->set_rx_mode)(dev); + + clear_bit(EVENT_SET_RX_MODE, &dev->flags); +} + /* work that cannot be done in interrupt context uses keventd. * * NOTE: with 2.5 we could do more of this using completion callbacks, @@ -1157,6 +1172,10 @@ skip_reset: if (test_bit (EVENT_LINK_CHANGE, &dev->flags)) __handle_link_change(dev); + if (test_bit (EVENT_SET_RX_MODE, &dev->flags)) + __handle_set_rx_mode(dev); + + if (dev->flags) netdev_dbg(dev->net, "kevent done, flags = 0x%lx\n", dev->flags); } @@ -1525,6 +1544,7 @@ static const struct net_device_ops usbnet_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, + .ndo_set_rx_mode = usbnet_set_rx_mode, .ndo_change_mtu = usbnet_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 26088feb6608..d9a4905e01d0 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -78,6 +78,7 @@ struct usbnet { # define EVENT_NO_RUNTIME_PM 9 # define EVENT_RX_KILL 10 # define EVENT_LINK_CHANGE 11 +# define EVENT_SET_RX_MODE 12 }; static inline struct usb_driver *driver_of(struct usb_interface *intf) @@ -159,6 +160,9 @@ struct driver_info { /* called by minidriver when receiving indication */ void (*indication)(struct usbnet *dev, void *ind, int indlen); + /* rx mode change (device changes address list filtering) */ + void (*set_rx_mode)(struct usbnet *dev); + /* for new devices, use the descriptor-reading code instead */ int in; /* rx endpoint */ int out; /* tx endpoint */ -- cgit v1.2.3 From 54ef6df3f3f1353d99c80c437259d317b2cd1cbd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Oct 2014 21:11:27 -0700 Subject: rcu: Provide counterpart to rcu_dereference() for non-RCU situations Although rcu_dereference() and friends can be used in situations where object lifetimes are being managed by something other than RCU, the resulting sparse and lockdep-RCU noise can be annoying. This commit therefore supplies a lockless_dereference(), which provides the protection for dereferences without the RCU-related debugging noise. Reported-by: Al Viro Signed-off-by: Paul E. McKenney Signed-off-by: Al Viro --- include/linux/rcupdate.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a4a819ffb2d1..53ff1a752d7e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -616,6 +616,21 @@ static inline void rcu_preempt_sleep_check(void) */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) +/** + * lockless_dereference() - safely load a pointer for later dereference + * @p: The pointer to load + * + * Similar to rcu_dereference(), but for situations where the pointed-to + * object's lifetime is managed by something other than RCU. That + * "something other" might be reference counting or simple immortality. + */ +#define lockless_dereference(p) \ +({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ + (_________p1); \ +}) + /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to -- cgit v1.2.3 From d1b72cc6d8cb766c802fdc70a5edc2f0ba8a2b57 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 27 Oct 2014 15:42:01 +0100 Subject: overlayfs: fix lockdep misannotation In an overlay directory that shadows an empty lower directory, say /mnt/a/empty102, do: touch /mnt/a/empty102/x unlink /mnt/a/empty102/x rmdir /mnt/a/empty102 It's actually harmless, but needs another level of nesting between I_MUTEX_CHILD and I_MUTEX_NORMAL. Signed-off-by: Miklos Szeredi Tested-by: David Howells Signed-off-by: Al Viro --- fs/namei.c | 2 +- fs/overlayfs/readdir.c | 2 +- include/linux/fs.h | 9 ++++++--- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 42df664e95e5..922f27068c4c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2497,7 +2497,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) } mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); return NULL; } EXPORT_SYMBOL(lock_rename); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 3fbf0d306e12..401f0840f5cc 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -571,7 +571,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; - mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4e41a4a331bb..01036262095f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -639,11 +639,13 @@ static inline int inode_unhashed(struct inode *inode) * 2: child/target * 3: xattr * 4: second non-directory - * The last is for certain operations (such as rename) which lock two + * 5: second parent (when locking independent directories in rename) + * + * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. * * The locking order between these classes is - * parent -> child -> normal -> xattr -> second non-directory + * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { @@ -651,7 +653,8 @@ enum inode_i_mutex_lock_class I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, - I_MUTEX_NONDIR2 + I_MUTEX_NONDIR2, + I_MUTEX_PARENT2, }; void lock_two_nondirectories(struct inode *, struct inode*); -- cgit v1.2.3 From cb1a5ab6ece7a37da4ac98ee26b0475b7c3ea79e Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 28 Oct 2014 20:27:43 -0600 Subject: block: Fix merge logic when CONFIG_BLK_DEV_INTEGRITY is not defined Commit 4eaf99beadce switched to returning bool and as a result reversed the logic of the integrity merge checks. However, the empty stubs used when the block integrity code is compiled out were still returning 0. Make these stubs return "true". Signed-off-by: Martin K. Petersen Reported-by: Michael L. Semon Tested-by: Michael L. Semon Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0207a78a8d82..6cbee8395f60 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1583,13 +1583,13 @@ static inline bool blk_integrity_merge_rq(struct request_queue *rq, struct request *r1, struct request *r2) { - return 0; + return true; } static inline bool blk_integrity_merge_bio(struct request_queue *rq, struct request *r, struct bio *b) { - return 0; + return true; } static inline bool blk_integrity_is_initialized(struct gendisk *g) { -- cgit v1.2.3 From 47f29df7db78ee4fcdb104cf36918d987ddd0278 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 29 Oct 2014 14:50:29 -0700 Subject: drivers: of: add return value to of_reserved_mem_device_init() Driver calling of_reserved_mem_device_init() might be interested if the initialization has been successful or not, so add support for returning error code. This fixes a build warining caused by commit 7bfa5ab6fa1b ("drivers: dma-coherent: add initialization from device tree"), which has been merged without this change and without fixing function return value. Fixes: 7bfa5ab6fa1b1 ("drivers: dma-coherent: add initialization from device tree") Signed-off-by: Marek Szyprowski Acked-by: Arnd Bergmann Cc: Michal Nazarewicz Cc: Grant Likely Cc: Laura Abbott Cc: Josh Cartwright Cc: Joonsoo Kim Cc: Kyungmin Park Cc: Russell King Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/dma-contiguous.c | 3 ++- drivers/of/of_reserved_mem.c | 14 +++++++++----- include/linux/of_reserved_mem.h | 9 ++++++--- 3 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 473ff4892401..950fff9ce453 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -223,9 +223,10 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, #undef pr_fmt #define pr_fmt(fmt) fmt -static void rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) +static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) { dev_set_cma_area(dev, rmem->priv); + return 0; } static void rmem_cma_device_release(struct reserved_mem *rmem, diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 59fb12e84e6b..dc566b38645f 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -243,23 +243,27 @@ static inline struct reserved_mem *__find_rmem(struct device_node *node) * This function assign memory region pointed by "memory-region" device tree * property to the given device. */ -void of_reserved_mem_device_init(struct device *dev) +int of_reserved_mem_device_init(struct device *dev) { struct reserved_mem *rmem; struct device_node *np; + int ret; np = of_parse_phandle(dev->of_node, "memory-region", 0); if (!np) - return; + return -ENODEV; rmem = __find_rmem(np); of_node_put(np); if (!rmem || !rmem->ops || !rmem->ops->device_init) - return; + return -EINVAL; + + ret = rmem->ops->device_init(rmem, dev); + if (ret == 0) + dev_info(dev, "assigned reserved memory node %s\n", rmem->name); - rmem->ops->device_init(rmem, dev); - dev_info(dev, "assigned reserved memory node %s\n", rmem->name); + return ret; } /** diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index 5b5efae09135..ad2f67054372 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -16,7 +16,7 @@ struct reserved_mem { }; struct reserved_mem_ops { - void (*device_init)(struct reserved_mem *rmem, + int (*device_init)(struct reserved_mem *rmem, struct device *dev); void (*device_release)(struct reserved_mem *rmem, struct device *dev); @@ -28,14 +28,17 @@ typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem); _OF_DECLARE(reservedmem, name, compat, init, reservedmem_of_init_fn) #ifdef CONFIG_OF_RESERVED_MEM -void of_reserved_mem_device_init(struct device *dev); +int of_reserved_mem_device_init(struct device *dev); void of_reserved_mem_device_release(struct device *dev); void fdt_init_reserved_mem(void); void fdt_reserved_mem_save_node(unsigned long node, const char *uname, phys_addr_t base, phys_addr_t size); #else -static inline void of_reserved_mem_device_init(struct device *dev) { } +static inline int of_reserved_mem_device_init(struct device *dev) +{ + return -ENOSYS; +} static inline void of_reserved_mem_device_release(struct device *pdev) { } static inline void fdt_init_reserved_mem(void) { } -- cgit v1.2.3 From 6d50e60cd2edb5a57154db5a6f64eef5aa59b751 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 29 Oct 2014 14:50:31 -0700 Subject: mm, thp: fix collapsing of hugepages on madvise If an anonymous mapping is not allowed to fault thp memory and then madvise(MADV_HUGEPAGE) is used after fault, khugepaged will never collapse this memory into thp memory. This occurs because the madvise(2) handler for thp, hugepage_madvise(), clears VM_NOHUGEPAGE on the stack and it isn't stored in vma->vm_flags until the final action of madvise_behavior(). This causes the khugepaged_enter_vma_merge() to be a no-op in hugepage_madvise() when the vma had previously had VM_NOHUGEPAGE set. Fix this by passing the correct vma flags to the khugepaged mm slot handler. There's no chance khugepaged can run on this vma until after madvise_behavior() returns since we hold mm->mmap_sem. It would be possible to clear VM_NOHUGEPAGE directly from vma->vm_flags in hugepage_advise(), but I didn't want to introduce special case behavior into madvise_behavior(). I think it's best to just let it always set vma->vm_flags itself. Signed-off-by: David Rientjes Reported-by: Suleiman Souhlal Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/khugepaged.h | 17 ++++++++++------- mm/huge_memory.c | 11 ++++++----- mm/mmap.c | 8 ++++---- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 6b394f0b5148..eeb307985715 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -6,7 +6,8 @@ #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); -extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); +extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags); #define khugepaged_enabled() \ (transparent_hugepage_flags & \ @@ -35,13 +36,13 @@ static inline void khugepaged_exit(struct mm_struct *mm) __khugepaged_exit(mm); } -static inline int khugepaged_enter(struct vm_area_struct *vma) +static inline int khugepaged_enter(struct vm_area_struct *vma, + unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) if ((khugepaged_always() || - (khugepaged_req_madv() && - vma->vm_flags & VM_HUGEPAGE)) && - !(vma->vm_flags & VM_NOHUGEPAGE)) + (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && + !(vm_flags & VM_NOHUGEPAGE)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; @@ -54,11 +55,13 @@ static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline int khugepaged_enter(struct vm_area_struct *vma) +static inline int khugepaged_enter(struct vm_area_struct *vma, + unsigned long vm_flags) { return 0; } -static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 780d12c000e9..de984159cf0b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -803,7 +803,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma))) + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { @@ -1970,7 +1970,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. */ - if (unlikely(khugepaged_enter_vma_merge(vma))) + if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags))) return -ENOMEM; break; case MADV_NOHUGEPAGE: @@ -2071,7 +2071,8 @@ int __khugepaged_enter(struct mm_struct *mm) return 0; } -int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { unsigned long hstart, hend; if (!vma->anon_vma) @@ -2083,11 +2084,11 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); + VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) - return khugepaged_enter(vma); + return khugepaged_enter(vma, vm_flags); return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 7f855206e7fb..87e82b38453c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1080,7 +1080,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(prev); + khugepaged_enter_vma_merge(prev, vm_flags); return prev; } @@ -1099,7 +1099,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(area); + khugepaged_enter_vma_merge(area, vm_flags); return area; } @@ -2208,7 +2208,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } @@ -2277,7 +2277,7 @@ int expand_downwards(struct vm_area_struct *vma, } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } -- cgit v1.2.3 From 3a3c02ecf7f2852f122d6d16fb9b3d9cb0c6f201 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 Oct 2014 14:50:46 -0700 Subject: mm: page-writeback: inline account_page_dirtied() into single caller A follow-up patch would have changed the call signature. To save the trouble, just fold it instead. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [3.17.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/page-writeback.c | 23 ++++------------------- 2 files changed, 4 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 27eb1bfbe704..b46461116cd2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1235,7 +1235,6 @@ int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_dirtied(struct page *page, struct address_space *mapping); -void account_page_writeback(struct page *page); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ff24c9d83112..ff6a5b07211e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2115,23 +2115,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) } EXPORT_SYMBOL(account_page_dirtied); -/* - * Helper function for set_page_writeback family. - * - * The caller must hold mem_cgroup_begin/end_update_page_stat() lock - * while calling this function. - * See test_set_page_writeback for example. - * - * NOTE: Unlike account_page_dirtied this does not rely on being atomic - * wrt interrupts. - */ -void account_page_writeback(struct page *page) -{ - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); - inc_zone_page_state(page, NR_WRITEBACK); -} -EXPORT_SYMBOL(account_page_writeback); - /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -2410,8 +2393,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } else { ret = TestSetPageWriteback(page); } - if (!ret) - account_page_writeback(page); + if (!ret) { + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + inc_zone_page_state(page, NR_WRITEBACK); + } mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); return ret; -- cgit v1.2.3 From d7365e783edb858279be1d03f61bc8d5d3383d90 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 Oct 2014 14:50:48 -0700 Subject: mm: memcontrol: fix missed end-writeback page accounting Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed page migration to uncharge the old page right away. The page is locked, unmapped, truncated, and off the LRU, but it could race with writeback ending, which then doesn't unaccount the page properly: test_clear_page_writeback() migration wait_on_page_writeback() TestClearPageWriteback() mem_cgroup_migrate() clear PCG_USED mem_cgroup_update_page_stat() if (PageCgroupUsed(pc)) decrease memcg pages under writeback release pc->mem_cgroup->move_lock The per-page statistics interface is heavily optimized to avoid a function call and a lookup_page_cgroup() in the file unmap fast path, which means it doesn't verify whether a page is still charged before clearing PageWriteback() and it has to do it in the stat update later. Rework it so that it looks up the page's memcg once at the beginning of the transaction and then uses it throughout. The charge will be verified before clearing PageWriteback() and migration can't uncharge the page as long as that is still set. The RCU lock will protect the memcg past uncharge. As far as losing the optimization goes, the following test results are from a microbenchmark that maps, faults, and unmaps a 4GB sparse file three times in a nested fashion, so that there are two negative passes that don't account but still go through the new transaction overhead. There is no actual difference: old: 33.195102545 seconds time elapsed ( +- 0.01% ) new: 33.199231369 seconds time elapsed ( +- 0.03% ) The time spent in page_remove_rmap()'s callees still adds up to the same, but the time spent in the function itself seems reduced: # Children Self Command Shared Object Symbol old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [3.17.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 58 ++++++++----------------- mm/memcontrol.c | 105 ++++++++++++++++++++++++--------------------- mm/page-writeback.c | 22 +++++----- mm/rmap.c | 20 ++++----- 4 files changed, 96 insertions(+), 109 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 19df5d857411..6b75640ef5ab 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -139,48 +139,23 @@ static inline bool mem_cgroup_disabled(void) return false; } -void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, - unsigned long *flags); - -extern atomic_t memcg_moving; - -static inline void mem_cgroup_begin_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) -{ - if (mem_cgroup_disabled()) - return; - rcu_read_lock(); - *locked = false; - if (atomic_read(&memcg_moving)) - __mem_cgroup_begin_update_page_stat(page, locked, flags); -} - -void __mem_cgroup_end_update_page_stat(struct page *page, - unsigned long *flags); -static inline void mem_cgroup_end_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) -{ - if (mem_cgroup_disabled()) - return; - if (*locked) - __mem_cgroup_end_update_page_stat(page, flags); - rcu_read_unlock(); -} - -void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, - int val); - -static inline void mem_cgroup_inc_page_stat(struct page *page, +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, + unsigned long *flags); +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, + unsigned long flags); +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx, int val); + +static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { - mem_cgroup_update_page_stat(page, idx, 1); + mem_cgroup_update_page_stat(memcg, idx, 1); } -static inline void mem_cgroup_dec_page_stat(struct page *page, +static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { - mem_cgroup_update_page_stat(page, idx, -1); + mem_cgroup_update_page_stat(memcg, idx, -1); } unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, @@ -315,13 +290,14 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void mem_cgroup_begin_update_page_stat(struct page *page, +static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, unsigned long *flags) { + return NULL; } -static inline void mem_cgroup_end_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) +static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, + bool locked, unsigned long flags) { } @@ -343,12 +319,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } -static inline void mem_cgroup_inc_page_stat(struct page *page, +static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { } -static inline void mem_cgroup_dec_page_stat(struct page *page, +static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 23976fd885fd..d6ac0e33e150 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1536,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) * start move here. */ -/* for quick checking without looking up memcg */ -atomic_t memcg_moving __read_mostly; - static void mem_cgroup_start_move(struct mem_cgroup *memcg) { - atomic_inc(&memcg_moving); atomic_inc(&memcg->moving_account); synchronize_rcu(); } @@ -1552,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) * Now, mem_cgroup_clear_mc() may call this function with NULL. * We check NULL in callee rather than caller. */ - if (memcg) { - atomic_dec(&memcg_moving); + if (memcg) atomic_dec(&memcg->moving_account); - } } /* @@ -2204,41 +2198,52 @@ cleanup: return true; } -/* - * Used to update mapped file or writeback or other statistics. +/** + * mem_cgroup_begin_page_stat - begin a page state statistics transaction + * @page: page that is going to change accounted state + * @locked: &memcg->move_lock slowpath was taken + * @flags: IRQ-state flags for &memcg->move_lock * - * Notes: Race condition + * This function must mark the beginning of an accounted page state + * change to prevent double accounting when the page is concurrently + * being moved to another memcg: * - * Charging occurs during page instantiation, while the page is - * unmapped and locked in page migration, or while the page table is - * locked in THP migration. No race is possible. + * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + * if (TestClearPageState(page)) + * mem_cgroup_update_page_stat(memcg, state, -1); + * mem_cgroup_end_page_stat(memcg, locked, flags); * - * Uncharge happens to pages with zero references, no race possible. + * The RCU lock is held throughout the transaction. The fast path can + * get away without acquiring the memcg->move_lock (@locked is false) + * because page moving starts with an RCU grace period. * - * Charge moving between groups is protected by checking mm->moving - * account and taking the move_lock in the slowpath. + * The RCU lock also protects the memcg from being freed when the page + * state that is going to change is the only thing preventing the page + * from being uncharged. E.g. end-writeback clearing PageWriteback(), + * which allows migration to go ahead and uncharge the page before the + * account transaction might be complete. */ - -void __mem_cgroup_begin_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, + bool *locked, + unsigned long *flags) { struct mem_cgroup *memcg; struct page_cgroup *pc; + rcu_read_lock(); + + if (mem_cgroup_disabled()) + return NULL; + pc = lookup_page_cgroup(page); again: memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - /* - * If this memory cgroup is not under account moving, we don't - * need to take move_lock_mem_cgroup(). Because we already hold - * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock(). - */ - VM_BUG_ON(!rcu_read_lock_held()); + return NULL; + + *locked = false; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; move_lock_mem_cgroup(memcg, flags); if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { @@ -2246,36 +2251,40 @@ again: goto again; } *locked = true; + + return memcg; } -void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) +/** + * mem_cgroup_end_page_stat - finish a page state statistics transaction + * @memcg: the memcg that was accounted against + * @locked: value received from mem_cgroup_begin_page_stat() + * @flags: value received from mem_cgroup_begin_page_stat() + */ +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, + unsigned long flags) { - struct page_cgroup *pc = lookup_page_cgroup(page); + if (memcg && locked) + move_unlock_mem_cgroup(memcg, &flags); - /* - * It's guaranteed that pc->mem_cgroup never changes while - * lock is held because a routine modifies pc->mem_cgroup - * should take move_lock_mem_cgroup(). - */ - move_unlock_mem_cgroup(pc->mem_cgroup, flags); + rcu_read_unlock(); } -void mem_cgroup_update_page_stat(struct page *page, +/** + * mem_cgroup_update_page_stat - update page state statistics + * @memcg: memcg to account against + * @idx: page state item to account + * @val: number of pages (positive or negative) + * + * See mem_cgroup_begin_page_stat() for locking requirements. + */ +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val) { - struct mem_cgroup *memcg; - struct page_cgroup *pc = lookup_page_cgroup(page); - unsigned long uninitialized_var(flags); - - if (mem_cgroup_disabled()) - return; - VM_BUG_ON(!rcu_read_lock_held()); - memcg = pc->mem_cgroup; - if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - this_cpu_add(memcg->stat->count[idx], val); + if (memcg) + this_cpu_add(memcg->stat->count[idx], val); } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ff6a5b07211e..19ceae87522d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2327,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2352,22 +2353,23 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2394,10 +2396,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 116a5053415b..f574046f77d4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1042,15 +1042,16 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - bool locked; + struct mem_cgroup *memcg; unsigned long flags; + bool locked; - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_end_page_stat(memcg, locked, flags); } /** @@ -1061,9 +1062,10 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { + struct mem_cgroup *uninitialized_var(memcg); bool anon = PageAnon(page); - bool locked; unsigned long flags; + bool locked; /* * The anon case has no mem_cgroup page_stat to update; but may @@ -1071,7 +1073,7 @@ void page_remove_rmap(struct page *page) * we hold the lock against page_stat move: so avoid it on anon. */ if (!anon) - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) @@ -1096,8 +1098,7 @@ void page_remove_rmap(struct page *page) -hpage_nr_pages(page)); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1110,10 +1111,9 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ - return; out: if (!anon) - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_end_page_stat(memcg, locked, flags); } /* -- cgit v1.2.3 From 39bb5e62867de82b269b07df900165029b928359 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Oct 2014 10:32:34 -0700 Subject: net: skb_fclone_busy() needs to detect orphaned skb Some drivers are unable to perform TX completions in a bound time. They instead call skb_orphan() Problem is skb_fclone_busy() has to detect this case, otherwise we block TCP retransmits and can freeze unlucky tcp sessions on mostly idle hosts. Signed-off-by: Eric Dumazet Fixes: 1f3279ae0c13 ("tcp: avoid retransmits of TCP packets hanging in host queues") Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 ++++++-- net/ipv4/tcp_output.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5884f95ff0e9..6c8b6f604e76 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -799,15 +799,19 @@ struct sk_buff_fclones { * @skb: buffer * * Returns true is skb is a fast clone, and its clone is not freed. + * Some drivers call skb_orphan() in their ndo_start_xmit(), + * so we also check that this didnt happen. */ -static inline bool skb_fclone_busy(const struct sk_buff *skb) +static inline bool skb_fclone_busy(const struct sock *sk, + const struct sk_buff *skb) { const struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); return skb->fclone == SKB_FCLONE_ORIG && - fclones->skb2.fclone == SKB_FCLONE_CLONE; + fclones->skb2.fclone == SKB_FCLONE_CLONE && + fclones->skb2.sk == sk; } static inline struct sk_buff *alloc_skb_fclone(unsigned int size, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3af21296d967..a3d453b94747 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2126,7 +2126,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { - if (unlikely(skb_fclone_busy(skb))) { + if (unlikely(skb_fclone_busy(sk, skb))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4c4e457e7888..88bf289abdc9 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1962,7 +1962,7 @@ static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; - if (unlikely(skb_fclone_busy(skb))) { + if (unlikely(skb_fclone_busy(sk, skb))) { kfree_skb(skb); return 0; } -- cgit v1.2.3 From b2de525f095708b2adbadaec3f1e4017a23d1e09 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Mon, 29 Sep 2014 10:21:10 -0400 Subject: Return short read or 0 at end of a raw device, not EIO Author: David Jeffery Changes to the basic direct I/O code have broken the raw driver when reading to the end of a raw device. Instead of returning a short read for a read that extends partially beyond the device's end or 0 when at the end of the device, these reads now return EIO. The raw driver needs the same end of device handling as was added for normal block devices. Using blkdev_read_iter, which has the needed size checks, prevents the EIO conditions at the end of the device. Signed-off-by: David Jeffery Signed-off-by: Al Viro --- drivers/char/raw.c | 2 +- fs/block_dev.c | 3 ++- include/linux/fs.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 0102dc788608..a24891b97547 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -285,7 +285,7 @@ static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd, static const struct file_operations raw_fops = { .read = new_sync_read, - .read_iter = generic_file_read_iter, + .read_iter = blkdev_read_iter, .write = new_sync_write, .write_iter = blkdev_write_iter, .fsync = blkdev_fsync, diff --git a/fs/block_dev.c b/fs/block_dev.c index cc9d4114cda0..1d9c9f3754f8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1585,7 +1585,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) } EXPORT_SYMBOL_GPL(blkdev_write_iter); -static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *bd_inode = file->f_mapping->host; @@ -1599,6 +1599,7 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) iov_iter_truncate(to, size); return generic_file_read_iter(iocb, to); } +EXPORT_SYMBOL_GPL(blkdev_read_iter); /* * Try to release a page associated with block device when the system diff --git a/include/linux/fs.h b/include/linux/fs.h index 01036262095f..9ab779e8a63c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2469,6 +2469,7 @@ extern ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo extern ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); /* fs/block_dev.c */ +extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to); extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from); extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync); -- cgit v1.2.3 From a87fa1d81a9fb5e9adca9820e16008c40ad09f33 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 3 Nov 2014 15:15:35 +0000 Subject: of: Fix overflow bug in string property parsing functions The string property read helpers will run off the end of the buffer if it is handed a malformed string property. Rework the parsers to make sure that doesn't happen. At the same time add new test cases to make sure the functions behave themselves. The original implementations of of_property_read_string_index() and of_property_count_strings() both open-coded the same block of parsing code, each with it's own subtly different bugs. The fix here merges functions into a single helper and makes the original functions static inline wrappers around the helper. One non-bugfix aspect of this patch is the addition of a new wrapper, of_property_read_string_array(). The new wrapper is needed by the device_properties feature that Rafael is working on and planning to merge for v3.19. The implementation is identical both with and without the new static inline wrapper, so it just got left in to reduce the churn on the header file. Signed-off-by: Grant Likely Cc: Rafael J. Wysocki Cc: Mika Westerberg Cc: Rob Herring Cc: Arnd Bergmann Cc: Darren Hart Cc: # v3.3+: Drop selftest hunks that don't apply --- drivers/of/base.c | 88 ++++++++--------------------- drivers/of/selftest.c | 66 ++++++++++++++++++++-- drivers/of/testcase-data/tests-phandle.dtsi | 2 + include/linux/of.h | 84 ++++++++++++++++++++++----- 4 files changed, 154 insertions(+), 86 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index 2305dc0382bc..3823edf2d012 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1279,52 +1279,6 @@ int of_property_read_string(struct device_node *np, const char *propname, } EXPORT_SYMBOL_GPL(of_property_read_string); -/** - * of_property_read_string_index - Find and read a string from a multiple - * strings property. - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @index: index of the string in the list of strings - * @out_string: pointer to null terminated return string, modified only if - * return value is 0. - * - * Search for a property in a device tree node and retrieve a null - * terminated string value (pointer to data, not a copy) in the list of strings - * contained in that property. - * Returns 0 on success, -EINVAL if the property does not exist, -ENODATA if - * property does not have a value, and -EILSEQ if the string is not - * null-terminated within the length of the property data. - * - * The out_string pointer is modified only if a valid string can be decoded. - */ -int of_property_read_string_index(struct device_node *np, const char *propname, - int index, const char **output) -{ - struct property *prop = of_find_property(np, propname, NULL); - int i = 0; - size_t l = 0, total = 0; - const char *p; - - if (!prop) - return -EINVAL; - if (!prop->value) - return -ENODATA; - if (strnlen(prop->value, prop->length) >= prop->length) - return -EILSEQ; - - p = prop->value; - - for (i = 0; total < prop->length; total += l, p += l) { - l = strlen(p) + 1; - if (i++ == index) { - *output = p; - return 0; - } - } - return -ENODATA; -} -EXPORT_SYMBOL_GPL(of_property_read_string_index); - /** * of_property_match_string() - Find string in a list and return index * @np: pointer to node containing string list property @@ -1351,7 +1305,7 @@ int of_property_match_string(struct device_node *np, const char *propname, end = p + prop->length; for (i = 0; p < end; i++, p += l) { - l = strlen(p) + 1; + l = strnlen(p, end - p) + 1; if (p + l > end) return -EILSEQ; pr_debug("comparing %s with %s\n", string, p); @@ -1363,39 +1317,41 @@ int of_property_match_string(struct device_node *np, const char *propname, EXPORT_SYMBOL_GPL(of_property_match_string); /** - * of_property_count_strings - Find and return the number of strings from a - * multiple strings property. + * of_property_read_string_util() - Utility helper for parsing string properties * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. + * @out_strs: output array of string pointers. + * @sz: number of array elements to read. + * @skip: Number of strings to skip over at beginning of list. * - * Search for a property in a device tree node and retrieve the number of null - * terminated string contain in it. Returns the number of strings on - * success, -EINVAL if the property does not exist, -ENODATA if property - * does not have a value, and -EILSEQ if the string is not null-terminated - * within the length of the property data. + * Don't call this function directly. It is a utility helper for the + * of_property_read_string*() family of functions. */ -int of_property_count_strings(struct device_node *np, const char *propname) +int of_property_read_string_helper(struct device_node *np, const char *propname, + const char **out_strs, size_t sz, int skip) { struct property *prop = of_find_property(np, propname, NULL); - int i = 0; - size_t l = 0, total = 0; - const char *p; + int l = 0, i = 0; + const char *p, *end; if (!prop) return -EINVAL; if (!prop->value) return -ENODATA; - if (strnlen(prop->value, prop->length) >= prop->length) - return -EILSEQ; - p = prop->value; + end = p + prop->length; - for (i = 0; total < prop->length; total += l, p += l, i++) - l = strlen(p) + 1; - - return i; + for (i = 0; p < end && (!out_strs || i < skip + sz); i++, p += l) { + l = strnlen(p, end - p) + 1; + if (p + l > end) + return -EILSEQ; + if (out_strs && i >= skip) + *out_strs++ = p; + } + i -= skip; + return i <= 0 ? -ENODATA : i; } -EXPORT_SYMBOL_GPL(of_property_count_strings); +EXPORT_SYMBOL_GPL(of_property_read_string_helper); void of_print_phandle_args(const char *msg, const struct of_phandle_args *args) { diff --git a/drivers/of/selftest.c b/drivers/of/selftest.c index 78001270a598..11b873c54a77 100644 --- a/drivers/of/selftest.c +++ b/drivers/of/selftest.c @@ -339,8 +339,9 @@ static void __init of_selftest_parse_phandle_with_args(void) selftest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc); } -static void __init of_selftest_property_match_string(void) +static void __init of_selftest_property_string(void) { + const char *strings[4]; struct device_node *np; int rc; @@ -357,13 +358,66 @@ static void __init of_selftest_property_match_string(void) rc = of_property_match_string(np, "phandle-list-names", "third"); selftest(rc == 2, "third expected:0 got:%i\n", rc); rc = of_property_match_string(np, "phandle-list-names", "fourth"); - selftest(rc == -ENODATA, "unmatched string; rc=%i", rc); + selftest(rc == -ENODATA, "unmatched string; rc=%i\n", rc); rc = of_property_match_string(np, "missing-property", "blah"); - selftest(rc == -EINVAL, "missing property; rc=%i", rc); + selftest(rc == -EINVAL, "missing property; rc=%i\n", rc); rc = of_property_match_string(np, "empty-property", "blah"); - selftest(rc == -ENODATA, "empty property; rc=%i", rc); + selftest(rc == -ENODATA, "empty property; rc=%i\n", rc); rc = of_property_match_string(np, "unterminated-string", "blah"); - selftest(rc == -EILSEQ, "unterminated string; rc=%i", rc); + selftest(rc == -EILSEQ, "unterminated string; rc=%i\n", rc); + + /* of_property_count_strings() tests */ + rc = of_property_count_strings(np, "string-property"); + selftest(rc == 1, "Incorrect string count; rc=%i\n", rc); + rc = of_property_count_strings(np, "phandle-list-names"); + selftest(rc == 3, "Incorrect string count; rc=%i\n", rc); + rc = of_property_count_strings(np, "unterminated-string"); + selftest(rc == -EILSEQ, "unterminated string; rc=%i\n", rc); + rc = of_property_count_strings(np, "unterminated-string-list"); + selftest(rc == -EILSEQ, "unterminated string array; rc=%i\n", rc); + + /* of_property_read_string_index() tests */ + rc = of_property_read_string_index(np, "string-property", 0, strings); + selftest(rc == 0 && !strcmp(strings[0], "foobar"), "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "string-property", 1, strings); + selftest(rc == -ENODATA && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "phandle-list-names", 0, strings); + selftest(rc == 0 && !strcmp(strings[0], "first"), "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "phandle-list-names", 1, strings); + selftest(rc == 0 && !strcmp(strings[0], "second"), "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "phandle-list-names", 2, strings); + selftest(rc == 0 && !strcmp(strings[0], "third"), "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "phandle-list-names", 3, strings); + selftest(rc == -ENODATA && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "unterminated-string", 0, strings); + selftest(rc == -EILSEQ && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "unterminated-string-list", 0, strings); + selftest(rc == 0 && !strcmp(strings[0], "first"), "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "unterminated-string-list", 2, strings); /* should fail */ + selftest(rc == -EILSEQ && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + strings[1] = NULL; + + /* of_property_read_string_array() tests */ + rc = of_property_read_string_array(np, "string-property", strings, 4); + selftest(rc == 1, "Incorrect string count; rc=%i\n", rc); + rc = of_property_read_string_array(np, "phandle-list-names", strings, 4); + selftest(rc == 3, "Incorrect string count; rc=%i\n", rc); + rc = of_property_read_string_array(np, "unterminated-string", strings, 4); + selftest(rc == -EILSEQ, "unterminated string; rc=%i\n", rc); + /* -- An incorrectly formed string should cause a failure */ + rc = of_property_read_string_array(np, "unterminated-string-list", strings, 4); + selftest(rc == -EILSEQ, "unterminated string array; rc=%i\n", rc); + /* -- parsing the correctly formed strings should still work: */ + strings[2] = NULL; + rc = of_property_read_string_array(np, "unterminated-string-list", strings, 2); + selftest(rc == 2 && strings[2] == NULL, "of_property_read_string_array() failure; rc=%i\n", rc); + strings[1] = NULL; + rc = of_property_read_string_array(np, "phandle-list-names", strings, 1); + selftest(rc == 1 && strings[1] == NULL, "Overwrote end of string array; rc=%i, str='%s'\n", rc, strings[1]); } #define propcmp(p1, p2) (((p1)->length == (p2)->length) && \ @@ -881,7 +935,7 @@ static int __init of_selftest(void) of_selftest_find_node_by_name(); of_selftest_dynamic(); of_selftest_parse_phandle_with_args(); - of_selftest_property_match_string(); + of_selftest_property_string(); of_selftest_property_copy(); of_selftest_changeset(); of_selftest_parse_interrupts(); diff --git a/drivers/of/testcase-data/tests-phandle.dtsi b/drivers/of/testcase-data/tests-phandle.dtsi index ce0fe083d406..5b1527e8a7fb 100644 --- a/drivers/of/testcase-data/tests-phandle.dtsi +++ b/drivers/of/testcase-data/tests-phandle.dtsi @@ -39,7 +39,9 @@ phandle-list-bad-args = <&provider2 1 0>, <&provider3 0>; empty-property; + string-property = "foobar"; unterminated-string = [40 41 42 43]; + unterminated-string-list = "first", "second", [40 41 42 43]; }; }; }; diff --git a/include/linux/of.h b/include/linux/of.h index 6545e7aec7bb..29f0adc5f3e4 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -267,14 +267,12 @@ extern int of_property_read_u64(const struct device_node *np, extern int of_property_read_string(struct device_node *np, const char *propname, const char **out_string); -extern int of_property_read_string_index(struct device_node *np, - const char *propname, - int index, const char **output); extern int of_property_match_string(struct device_node *np, const char *propname, const char *string); -extern int of_property_count_strings(struct device_node *np, - const char *propname); +extern int of_property_read_string_helper(struct device_node *np, + const char *propname, + const char **out_strs, size_t sz, int index); extern int of_device_is_compatible(const struct device_node *device, const char *); extern int of_device_is_available(const struct device_node *device); @@ -486,15 +484,9 @@ static inline int of_property_read_string(struct device_node *np, return -ENOSYS; } -static inline int of_property_read_string_index(struct device_node *np, - const char *propname, int index, - const char **out_string) -{ - return -ENOSYS; -} - -static inline int of_property_count_strings(struct device_node *np, - const char *propname) +static inline int of_property_read_string_helper(struct device_node *np, + const char *propname, + const char **out_strs, size_t sz, int index) { return -ENOSYS; } @@ -667,6 +659,70 @@ static inline int of_property_count_u64_elems(const struct device_node *np, return of_property_count_elems_of_size(np, propname, sizeof(u64)); } +/** + * of_property_read_string_array() - Read an array of strings from a multiple + * strings property. + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_strs: output array of string pointers. + * @sz: number of array elements to read. + * + * Search for a property in a device tree node and retrieve a list of + * terminated string values (pointer to data, not a copy) in that property. + * + * If @out_strs is NULL, the number of strings in the property is returned. + */ +static inline int of_property_read_string_array(struct device_node *np, + const char *propname, const char **out_strs, + size_t sz) +{ + return of_property_read_string_helper(np, propname, out_strs, sz, 0); +} + +/** + * of_property_count_strings() - Find and return the number of strings from a + * multiple strings property. + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * + * Search for a property in a device tree node and retrieve the number of null + * terminated string contain in it. Returns the number of strings on + * success, -EINVAL if the property does not exist, -ENODATA if property + * does not have a value, and -EILSEQ if the string is not null-terminated + * within the length of the property data. + */ +static inline int of_property_count_strings(struct device_node *np, + const char *propname) +{ + return of_property_read_string_helper(np, propname, NULL, 0, 0); +} + +/** + * of_property_read_string_index() - Find and read a string from a multiple + * strings property. + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @index: index of the string in the list of strings + * @out_string: pointer to null terminated return string, modified only if + * return value is 0. + * + * Search for a property in a device tree node and retrieve a null + * terminated string value (pointer to data, not a copy) in the list of strings + * contained in that property. + * Returns 0 on success, -EINVAL if the property does not exist, -ENODATA if + * property does not have a value, and -EILSEQ if the string is not + * null-terminated within the length of the property data. + * + * The out_string pointer is modified only if a valid string can be decoded. + */ +static inline int of_property_read_string_index(struct device_node *np, + const char *propname, + int index, const char **output) +{ + int rc = of_property_read_string_helper(np, propname, output, 1, index); + return rc < 0 ? rc : 0; +} + /** * of_property_read_bool - Findfrom a property * @np: device node from which the property value is to be read. -- cgit v1.2.3 From 32f638fc11db0526c706454d9ab4339d55ac89f3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 30 Oct 2014 10:17:25 -0600 Subject: PCI: Don't oops on virtual buses in acpi_pci_get_bridge_handle() acpi_pci_get_bridge_handle() returns the ACPI handle for the bridge device (either a host bridge or a PCI-to-PCI bridge) leading to a PCI bus. But SR-IOV virtual functions can be on a virtual bus with no bridge leading to it. Return a NULL acpi_handle in this case instead of trying to dereference the NULL pointer to the bridge. This fixes a NULL pointer dereference oops in pci_get_hp_params() when adding SR-IOV VF devices on virtual buses. [bhelgaas: changelog, add comment in code] Fixes: 6cd33649fa83 ("PCI: Add pci_configure_device() during enumeration") Link: https://bugzilla.kernel.org/show_bug.cgi?id=87591 Reported-by: Chao Zhou Reported-by: Joerg Roedel Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- include/linux/pci-acpi.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 64dacb7288a6..24c7728ca681 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -41,8 +41,13 @@ static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) if (pci_is_root_bus(pbus)) dev = pbus->bridge; - else + else { + /* If pbus is a virtual bus, there is no bridge to it */ + if (!pbus->self) + return NULL; + dev = &pbus->self->dev; + } return ACPI_HANDLE(dev); } -- cgit v1.2.3 From 9cdb5dbf79f4e8f43e19ab7f4ec9ed74c146f0af Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 5 Nov 2014 21:44:27 +0100 Subject: include/linux/socket.h: Fix comment File descriptors are always closed on exit :-) Signed-off-by: Rasmus Villemoes Signed-off-by: David S. Miller --- include/linux/socket.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index ec538fc287a6..bb9b83640070 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -256,7 +256,7 @@ struct ucred { #define MSG_EOF MSG_FIN #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ -#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exit for file +#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through SCM_RIGHTS */ #if defined(CONFIG_COMPAT) -- cgit v1.2.3 From c16561e8df7a64764ef61f02221e98273add325a Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 6 Nov 2014 00:37:08 +0100 Subject: PM / Domains: Change prototype for the attach and detach callbacks Convert the prototypes to return an int in order to support error handling in these callbacks. Also, as suggested by Dmitry Torokhov, pass the domain pointer for use inside the callbacks, and so that they match the existing power_on/power_off callbacks which currently take the domain pointer. Acked-by: Dmitry Torokhov Acked-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson [ khilman: added domain as parameter to callbacks, as suggested by Dmitry ] Signed-off-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 4 ++-- include/linux/pm_domain.h | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 40bc2f4072cc..b520687046d4 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1437,7 +1437,7 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, spin_unlock_irq(&dev->power.lock); if (genpd->attach_dev) - genpd->attach_dev(dev); + genpd->attach_dev(genpd, dev); mutex_lock(&gpd_data->lock); gpd_data->base.dev = dev; @@ -1499,7 +1499,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, genpd->max_off_time_changed = true; if (genpd->detach_dev) - genpd->detach_dev(dev); + genpd->detach_dev(genpd, dev); spin_lock_irq(&dev->power.lock); diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 73e938b7e937..b3ed7766a291 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -72,8 +72,10 @@ struct generic_pm_domain { bool max_off_time_changed; bool cached_power_down_ok; struct gpd_cpuidle_data *cpuidle_data; - void (*attach_dev)(struct device *dev); - void (*detach_dev)(struct device *dev); + int (*attach_dev)(struct generic_pm_domain *domain, + struct device *dev); + void (*detach_dev)(struct generic_pm_domain *domain, + struct device *dev); }; static inline struct generic_pm_domain *pd_to_genpd(struct dev_pm_domain *pd) -- cgit v1.2.3 From c0acb8144bd6d8d88aee1dab33364b7353e9a903 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 10 Oct 2014 12:48:35 +0200 Subject: mfd: max77693: Fix always masked MUIC interrupts All interrupts coming from MUIC were ignored because interrupt source register was masked. The Maxim 77693 has a "interrupt source" - a separate register and interrupts which give information about PMIC block triggering the individual interrupt (charger, topsys, MUIC, flash LED). By default bootloader could initialize this register to "mask all" value. In such case (observed on Trats2 board) MUIC interrupts won't be generated regardless of their mask status. Regmap irq chip was unmasking individual MUIC interrupts but the source was masked Before introducing regmap irq chip this interrupt source was unmasked, read and acked. Reading and acking is not necessary but unmasking is. Fixes: 342d669c1ee4 ("mfd: max77693: Handle IRQs using regmap") Cc: Signed-off-by: Krzysztof Kozlowski Reviewed-by: Chanwoo Choi Signed-off-by: Lee Jones --- drivers/mfd/max77693.c | 12 ++++++++++++ include/linux/mfd/max77693-private.h | 7 +++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/max77693.c b/drivers/mfd/max77693.c index 22d23fcbb65c..711773e8e64b 100644 --- a/drivers/mfd/max77693.c +++ b/drivers/mfd/max77693.c @@ -250,6 +250,17 @@ static int max77693_i2c_probe(struct i2c_client *i2c, goto err_irq_muic; } + /* Unmask interrupts from all blocks in interrupt source register */ + ret = regmap_update_bits(max77693->regmap, + MAX77693_PMIC_REG_INTSRC_MASK, + SRC_IRQ_ALL, (unsigned int)~SRC_IRQ_ALL); + if (ret < 0) { + dev_err(max77693->dev, + "Could not unmask interrupts in INTSRC: %d\n", + ret); + goto err_intsrc; + } + pm_runtime_set_active(max77693->dev); ret = mfd_add_devices(max77693->dev, -1, max77693_devs, @@ -261,6 +272,7 @@ static int max77693_i2c_probe(struct i2c_client *i2c, err_mfd: mfd_remove_devices(max77693->dev); +err_intsrc: regmap_del_irq_chip(max77693->irq, max77693->irq_data_muic); err_irq_muic: regmap_del_irq_chip(max77693->irq, max77693->irq_data_charger); diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h index fc17d56581b2..582e67f34054 100644 --- a/include/linux/mfd/max77693-private.h +++ b/include/linux/mfd/max77693-private.h @@ -330,6 +330,13 @@ enum max77693_irq_source { MAX77693_IRQ_GROUP_NR, }; +#define SRC_IRQ_CHARGER BIT(0) +#define SRC_IRQ_TOP BIT(1) +#define SRC_IRQ_FLASH BIT(2) +#define SRC_IRQ_MUIC BIT(3) +#define SRC_IRQ_ALL (SRC_IRQ_CHARGER | SRC_IRQ_TOP \ + | SRC_IRQ_FLASH | SRC_IRQ_MUIC) + #define LED_IRQ_FLED2_OPEN BIT(0) #define LED_IRQ_FLED2_SHORT BIT(1) #define LED_IRQ_FLED1_OPEN BIT(2) -- cgit v1.2.3 From e30f53aad2202b5526c40c36d8eeac8bf290bde5 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Mon, 10 Nov 2014 19:46:34 +0100 Subject: tracing: Do not busy wait in buffer splice On a !PREEMPT kernel, attempting to use trace-cmd results in a soft lockup: # trace-cmd record -e raw_syscalls:* -F false NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [trace-cmd:61] ... Call Trace: [] ? __wake_up_common+0x90/0x90 [] wait_on_pipe+0x35/0x40 [] tracing_buffers_splice_read+0x2e3/0x3c0 [] ? tracing_stats_read+0x2a0/0x2a0 [] ? _raw_spin_unlock+0x2b/0x40 [] ? do_read_fault+0x21b/0x290 [] ? handle_mm_fault+0x2ba/0xbd0 [] ? trace_event_buffer_lock_reserve+0x40/0x80 [] ? trace_buffer_lock_reserve+0x22/0x60 [] ? trace_event_buffer_lock_reserve+0x40/0x80 [] do_splice_to+0x6d/0x90 [] SyS_splice+0x7c1/0x800 [] tracesys_phase2+0xd3/0xd8 The problem is this: tracing_buffers_splice_read() calls ring_buffer_wait() to wait for data in the ring buffers. The buffers are not empty so ring_buffer_wait() returns immediately. But tracing_buffers_splice_read() calls ring_buffer_read_page() with full=1, meaning it only wants to read a full page. When the full page is not available, tracing_buffers_splice_read() tries to wait again with ring_buffer_wait(), which again returns immediately, and so on. Fix this by adding a "full" argument to ring_buffer_wait() which will make ring_buffer_wait() wait until the writer has left the reader's page, i.e. until full-page reads will succeed. Link: http://lkml.kernel.org/r/1415645194-25379-1-git-send-email-rabin@rab.in Cc: stable@vger.kernel.org # 3.16+ Fixes: b1169cc69ba9 ("tracing: Remove mock up poll wait function") Signed-off-by: Rabin Vincent Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 81 ++++++++++++++++++++++++++++++--------------- kernel/trace/trace.c | 23 ++++--------- 3 files changed, 62 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 49a4d6f59108..e2c13cd863bd 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -97,7 +97,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) -int ring_buffer_wait(struct ring_buffer *buffer, int cpu); +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full); int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2d75c94ae87d..a56e07c8d15b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work) * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on + * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS * * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. */ -int ring_buffer_wait(struct ring_buffer *buffer, int cpu) +int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) { - struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer); DEFINE_WAIT(wait); struct rb_irq_work *work; + int ret = 0; /* * Depending on what the caller is waiting for, either any @@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu) } - prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + while (true) { + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); - /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. - */ - work->waiters_pending = true; + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + work->waiters_pending = true; + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) + break; + + if (cpu != RING_BUFFER_ALL_CPUS && + !ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; + + if (!full) + break; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + if (!pagebusy) + break; + } - if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || - (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) schedule(); + } finish_wait(&work->waiters, &wait); - return 0; + + return ret; } /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a528392b1f4..15209335888d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) } #endif /* CONFIG_TRACER_MAX_TRACE */ -static int wait_on_pipe(struct trace_iterator *iter) +static int wait_on_pipe(struct trace_iterator *iter, bool full) { /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); + return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file, + full); } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -4434,15 +4435,12 @@ static int tracing_wait_pipe(struct file *filp) mutex_unlock(&iter->mutex); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, false); mutex_lock(&iter->mutex); if (ret) return ret; - - if (signal_pending(current)) - return -EINTR; } return 1; @@ -5372,16 +5370,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, goto out_unlock; } mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, false); mutex_lock(&trace_types_lock); if (ret) { size = ret; goto out_unlock; } - if (signal_pending(current)) { - size = -EINTR; - goto out_unlock; - } goto again; } size = 0; @@ -5587,14 +5581,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, goto out; } mutex_unlock(&trace_types_lock); - ret = wait_on_pipe(iter); + ret = wait_on_pipe(iter, true); mutex_lock(&trace_types_lock); if (ret) goto out; - if (signal_pending(current)) { - ret = -EINTR; - goto out; - } + goto again; } -- cgit v1.2.3 From 67732cd34382066ae5df313b6dad65ab14b9735f Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 11 Nov 2014 11:07:08 +0100 Subject: PM / Domains: Fix initial default state of the need_restore flag The initial state of the device's need_restore flag should'nt depend on the current state of the PM domain. For example it should be perfectly valid to attach an inactive device to a powered PM domain. The pm_genpd_dev_need_restore() API allow us to update the need_restore flag to somewhat cope with such scenarios. Typically that should have been done from drivers/buses ->probe() since it's those that put the requirements on the value of the need_restore flag. Until recently, the Exynos SOCs were the only user of the pm_genpd_dev_need_restore() API, though invoking it from a centralized location while adding devices to their PM domains. Due to that Exynos now have swithed to the generic OF-based PM domain look-up, it's no longer possible to invoke the API from a centralized location. The reason is because devices are now added to their PM domains during the probe sequence. Commit "ARM: exynos: Move to generic PM domain DT bindings" did the switch for Exynos to the generic OF-based PM domain look-up, but it also removed the call to pm_genpd_dev_need_restore(). This caused a regression for some of the Exynos drivers. To handle things more properly in the generic PM domain, let's change the default initial value of the need_restore flag to reflect that the state is unknown. As soon as some of the runtime PM callbacks gets invoked, update the initial value accordingly. Moreover, since the generic PM domain is verifying that all devices are both runtime PM enabled and suspended, using pm_runtime_suspended() while pm_genpd_poweroff() is invoked from the scheduled work, we can be sure of that the PM domain won't be powering off while having active devices. Do note that, the generic PM domain can still only know about active devices which has been activated through invoking its runtime PM resume callback. In other words, buses/drivers using pm_runtime_set_active() during ->probe() will still suffer from a race condition, potentially probing a device without having its PM domain being powered. That issue will have to be solved using a different approach. This a log from the boot regression for Exynos5, which is being fixed in this patch. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 308 at ../drivers/clk/clk.c:851 clk_disable+0x24/0x30() Modules linked in: CPU: 0 PID: 308 Comm: kworker/0:1 Not tainted 3.18.0-rc3-00569-gbd9449f-dirty #10 Workqueue: pm pm_runtime_work [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x70/0xbc) [] (dump_stack) from [] (warn_slowpath_common+0x64/0x88) [] (warn_slowpath_common) from [] (warn_slowpath_null+0x1c/0x24) [] (warn_slowpath_null) from [] (clk_disable+0x24/0x30) [] (clk_disable) from [] (gsc_runtime_suspend+0x128/0x160) [] (gsc_runtime_suspend) from [] (pm_generic_runtime_suspend+0x2c/0x38) [] (pm_generic_runtime_suspend) from [] (pm_genpd_default_save_state+0x2c/0x8c) [] (pm_genpd_default_save_state) from [] (pm_genpd_poweroff+0x224/0x3ec) [] (pm_genpd_poweroff) from [] (pm_genpd_runtime_suspend+0x9c/0xcc) [] (pm_genpd_runtime_suspend) from [] (__rpm_callback+0x2c/0x60) [] (__rpm_callback) from [] (rpm_callback+0x20/0x74) [] (rpm_callback) from [] (rpm_suspend+0xd4/0x43c) [] (rpm_suspend) from [] (pm_runtime_work+0x80/0x90) [] (pm_runtime_work) from [] (process_one_work+0x12c/0x314) [] (process_one_work) from [] (worker_thread+0x3c/0x4b0) [] (worker_thread) from [] (kthread+0xcc/0xe8) [] (kthread) from [] (ret_from_fork+0x14/0x3c) ---[ end trace 40cd58bcd6988f12 ]--- Fixes: a4a8c2c4962bb655 (ARM: exynos: Move to generic PM domain DT bindings) Reported-and-tested0by: Sylwester Nawrocki Reviewed-by: Sylwester Nawrocki Reviewed-by: Kevin Hilman Signed-off-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 38 ++++++++++++++++++++++++++++++++------ include/linux/pm_domain.h | 2 +- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index b520687046d4..fb83d4acd400 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -361,9 +361,19 @@ static int __pm_genpd_save_device(struct pm_domain_data *pdd, struct device *dev = pdd->dev; int ret = 0; - if (gpd_data->need_restore) + if (gpd_data->need_restore > 0) return 0; + /* + * If the value of the need_restore flag is still unknown at this point, + * we trust that pm_genpd_poweroff() has verified that the device is + * already runtime PM suspended. + */ + if (gpd_data->need_restore < 0) { + gpd_data->need_restore = 1; + return 0; + } + mutex_unlock(&genpd->lock); genpd_start_dev(genpd, dev); @@ -373,7 +383,7 @@ static int __pm_genpd_save_device(struct pm_domain_data *pdd, mutex_lock(&genpd->lock); if (!ret) - gpd_data->need_restore = true; + gpd_data->need_restore = 1; return ret; } @@ -389,12 +399,17 @@ static void __pm_genpd_restore_device(struct pm_domain_data *pdd, { struct generic_pm_domain_data *gpd_data = to_gpd_data(pdd); struct device *dev = pdd->dev; - bool need_restore = gpd_data->need_restore; + int need_restore = gpd_data->need_restore; - gpd_data->need_restore = false; + gpd_data->need_restore = 0; mutex_unlock(&genpd->lock); genpd_start_dev(genpd, dev); + + /* + * Call genpd_restore_dev() for recently added devices too (need_restore + * is negative then). + */ if (need_restore) genpd_restore_dev(genpd, dev); @@ -603,6 +618,7 @@ static void genpd_power_off_work_fn(struct work_struct *work) static int pm_genpd_runtime_suspend(struct device *dev) { struct generic_pm_domain *genpd; + struct generic_pm_domain_data *gpd_data; bool (*stop_ok)(struct device *__dev); int ret; @@ -628,6 +644,16 @@ static int pm_genpd_runtime_suspend(struct device *dev) return 0; mutex_lock(&genpd->lock); + + /* + * If we have an unknown state of the need_restore flag, it means none + * of the runtime PM callbacks has been invoked yet. Let's update the + * flag to reflect that the current state is active. + */ + gpd_data = to_gpd_data(dev->power.subsys_data->domain_data); + if (gpd_data->need_restore < 0) + gpd_data->need_restore = 0; + genpd->in_progress++; pm_genpd_poweroff(genpd); genpd->in_progress--; @@ -1442,7 +1468,7 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, mutex_lock(&gpd_data->lock); gpd_data->base.dev = dev; list_add_tail(&gpd_data->base.list_node, &genpd->dev_list); - gpd_data->need_restore = genpd->status == GPD_STATE_POWER_OFF; + gpd_data->need_restore = -1; gpd_data->td.constraint_changed = true; gpd_data->td.effective_constraint_ns = -1; mutex_unlock(&gpd_data->lock); @@ -1546,7 +1572,7 @@ void pm_genpd_dev_need_restore(struct device *dev, bool val) psd = dev_to_psd(dev); if (psd && psd->domain_data) - to_gpd_data(psd->domain_data)->need_restore = val; + to_gpd_data(psd->domain_data)->need_restore = val ? 1 : 0; spin_unlock_irqrestore(&dev->power.lock, flags); } diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index b3ed7766a291..2e0e06daf8c0 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -106,7 +106,7 @@ struct generic_pm_domain_data { struct notifier_block nb; struct mutex lock; unsigned int refcount; - bool need_restore; + int need_restore; }; #ifdef CONFIG_PM_GENERIC_DOMAINS -- cgit v1.2.3 From 8c393f9a721c30a030049a680e1bf896669bb279 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 5 Nov 2014 22:36:50 +0800 Subject: nfs: fix pnfs direct write memory leak For pNFS direct writes, layout driver may dynamically allocate ds_cinfo.buckets. So we need to take care to free them when freeing dreq. Ideally this needs to be done inside layout driver where ds_cinfo.buckets are allocated. But buckets are attached to dreq and reused across LD IO iterations. So I feel it's OK to free them in the generic layer. Cc: stable@vger.kernel.org [v3.4+] Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 1 + include/linux/nfs_xdr.h | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 20cffc830468..10bf07280f4a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -266,6 +266,7 @@ static void nfs_direct_req_free(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo); if (dreq->l_ctx != NULL) nfs_put_lock_context(dreq->l_ctx); if (dreq->ctx != NULL) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 983876f24aed..47ebb4fafd87 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1224,11 +1224,22 @@ struct nfs41_free_stateid_res { unsigned int status; }; +static inline void +nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo) +{ + kfree(cinfo->buckets); +} + #else struct pnfs_ds_commit_info { }; +static inline void +nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo) +{ +} + #endif /* CONFIG_NFS_V4_1 */ #ifdef CONFIG_NFS_V4_2 -- cgit v1.2.3 From ad53f92eb416d81e469fa8ea57153e59455e7175 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 13 Nov 2014 15:19:11 -0800 Subject: mm/page_alloc: fix incorrect isolation behavior by rechecking migratetype Before describing bugs itself, I first explain definition of freepage. 1. pages on buddy list are counted as freepage. 2. pages on isolate migratetype buddy list are *not* counted as freepage. 3. pages on cma buddy list are counted as CMA freepage, too. Now, I describe problems and related patch. Patch 1: There is race conditions on getting pageblock migratetype that it results in misplacement of freepages on buddy list, incorrect freepage count and un-availability of freepage. Patch 2: Freepages on pcp list could have stale cached information to determine migratetype of buddy list to go. This causes misplacement of freepages on buddy list and incorrect freepage count. Patch 4: Merging between freepages on different migratetype of pageblocks will cause freepages accouting problem. This patch fixes it. Without patchset [3], above problem doesn't happens on my CMA allocation test, because CMA reserved pages aren't used at all. So there is no chance for above race. With patchset [3], I did simple CMA allocation test and get below result: - Virtual machine, 4 cpus, 1024 MB memory, 256 MB CMA reservation - run kernel build (make -j16) on background - 30 times CMA allocation(8MB * 30 = 240MB) attempts in 5 sec interval - Result: more than 5000 freepage count are missed With patchset [3] and this patchset, I found that no freepage count are missed so that I conclude that problems are solved. On my simple memory offlining test, these problems also occur on that environment, too. This patch (of 4): There are two paths to reach core free function of buddy allocator, __free_one_page(), one is free_one_page()->__free_one_page() and the other is free_hot_cold_page()->free_pcppages_bulk()->__free_one_page(). Each paths has race condition causing serious problems. At first, this patch is focused on first type of freepath. And then, following patch will solve the problem in second type of freepath. In the first type of freepath, we got migratetype of freeing page without holding the zone lock, so it could be racy. There are two cases of this race. 1. pages are added to isolate buddy list after restoring orignal migratetype CPU1 CPU2 get migratetype => return MIGRATE_ISOLATE call free_one_page() with MIGRATE_ISOLATE grab the zone lock unisolate pageblock release the zone lock grab the zone lock call __free_one_page() with MIGRATE_ISOLATE freepage go into isolate buddy list, although pageblock is already unisolated This may cause two problems. One is that we can't use this page anymore until next isolation attempt of this pageblock, because freepage is on isolate buddy list. The other is that freepage accouting could be wrong due to merging between different buddy list. Freepages on isolate buddy list aren't counted as freepage, but ones on normal buddy list are counted as freepage. If merge happens, buddy freepage on normal buddy list is inevitably moved to isolate buddy list without any consideration of freepage accouting so it could be incorrect. 2. pages are added to normal buddy list while pageblock is isolated. It is similar with above case. This also may cause two problems. One is that we can't keep these freepages from being allocated. Although this pageblock is isolated, freepage would be added to normal buddy list so that it could be allocated without any restriction. And the other problem is same as case 1, that it, incorrect freepage accouting. This race condition would be prevented by checking migratetype again with holding the zone lock. Because it is somewhat heavy operation and it isn't needed in common case, we want to avoid rechecking as much as possible. So this patch introduce new variable, nr_isolate_pageblock in struct zone to check if there is isolated pageblock. With this, we can avoid to re-check migratetype in common case and do it only if there is isolated pageblock or migratetype is MIGRATE_ISOLATE. This solve above mentioned problems. Changes from v3: Add one more check in free_one_page() that checks whether migratetype is MIGRATE_ISOLATE or not. Without this, abovementioned case 1 could happens. Signed-off-by: Joonsoo Kim Acked-by: Minchan Kim Acked-by: Michal Nazarewicz Acked-by: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Johannes Weiner Cc: Yasuaki Ishimatsu Cc: Zhang Yanfei Cc: Tang Chen Cc: Naoya Horiguchi Cc: Bartlomiej Zolnierkiewicz Cc: Wen Congyang Cc: Marek Szyprowski Cc: Laura Abbott Cc: Heesub Shin Cc: "Aneesh Kumar K.V" Cc: Ritesh Harjani Cc: Gioh Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 9 +++++++++ include/linux/page-isolation.h | 8 ++++++++ mm/page_alloc.c | 11 +++++++++-- mm/page_isolation.c | 2 ++ 4 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 48bf12ef6620..ffe66e381c04 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -431,6 +431,15 @@ struct zone { */ int nr_migrate_reserve_block; +#ifdef CONFIG_MEMORY_ISOLATION + /* + * Number of isolated pageblock. It is used to solve incorrect + * freepage counting problem due to racy retrieving migratetype + * of pageblock. Protected by zone->lock. + */ + unsigned long nr_isolate_pageblock; +#endif + #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 3fff8e774067..2dc1e1697b45 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -2,6 +2,10 @@ #define __LINUX_PAGEISOLATION_H #ifdef CONFIG_MEMORY_ISOLATION +static inline bool has_isolate_pageblock(struct zone *zone) +{ + return zone->nr_isolate_pageblock; +} static inline bool is_migrate_isolate_page(struct page *page) { return get_pageblock_migratetype(page) == MIGRATE_ISOLATE; @@ -11,6 +15,10 @@ static inline bool is_migrate_isolate(int migratetype) return migratetype == MIGRATE_ISOLATE; } #else +static inline bool has_isolate_pageblock(struct zone *zone) +{ + return false; +} static inline bool is_migrate_isolate_page(struct page *page) { return false; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9cd36b822444..df1da25b309b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -739,9 +739,16 @@ static void free_one_page(struct zone *zone, if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + if (is_migrate_isolate(migratetype)) + goto skip_counting; + } + __mod_zone_freepage_state(zone, 1 << order, migratetype); + +skip_counting: __free_one_page(page, pfn, zone, order, migratetype); - if (unlikely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index d1473b2e9481..1fa4a4d72ecc 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -60,6 +60,7 @@ out: int migratetype = get_pageblock_migratetype(page); set_pageblock_migratetype(page, MIGRATE_ISOLATE); + zone->nr_isolate_pageblock++; nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); __mod_zone_freepage_state(zone, -nr_pages, migratetype); @@ -83,6 +84,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) nr_pages = move_freepages_block(zone, page, migratetype); __mod_zone_freepage_state(zone, nr_pages, migratetype); set_pageblock_migratetype(page, migratetype); + zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); } -- cgit v1.2.3 From f784a3f19613901ca4539a5b0eed3bdc700e6ee7 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Thu, 13 Nov 2014 15:19:39 -0800 Subject: mem-hotplug: reset node managed pages when hot-adding a new pgdat In free_area_init_core(), zone->managed_pages is set to an approximate value for lowmem, and will be adjusted when the bootmem allocator frees pages into the buddy system. But free_area_init_core() is also called by hotadd_new_pgdat() when hot-adding memory. As a result, zone->managed_pages of the newly added node's pgdat is set to an approximate value in the very beginning. Even if the memory on that node has node been onlined, /sys/device/system/node/nodeXXX/meminfo has wrong value: hot-add node2 (memory not onlined) cat /sys/device/system/node/node2/meminfo Node 2 MemTotal: 33554432 kB Node 2 MemFree: 0 kB Node 2 MemUsed: 33554432 kB Node 2 Active: 0 kB This patch fixes this problem by reset node managed pages to 0 after hot-adding a new node. 1. Move reset_managed_pages_done from reset_node_managed_pages() to reset_all_zones_managed_pages() 2. Make reset_node_managed_pages() non-static 3. Call reset_node_managed_pages() in hotadd_new_pgdat() after pgdat is initialized Signed-off-by: Tang Chen Signed-off-by: Yasuaki Ishimatsu Cc: [3.16+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 1 + mm/bootmem.c | 9 +++++---- mm/memory_hotplug.c | 9 +++++++++ mm/nobootmem.c | 8 +++++--- 4 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 4e2bd4c95b66..0995c2de8162 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -46,6 +46,7 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat, extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); extern unsigned long free_all_bootmem(void); +extern void reset_node_managed_pages(pg_data_t *pgdat); extern void reset_all_zones_managed_pages(void); extern void free_bootmem_node(pg_data_t *pgdat, diff --git a/mm/bootmem.c b/mm/bootmem.c index 8a000cebb0d7..477be696511d 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -243,13 +243,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) static int reset_managed_pages_done __initdata; -static inline void __init reset_node_managed_pages(pg_data_t *pgdat) +void reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - if (reset_managed_pages_done) - return; - for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) z->managed_pages = 0; } @@ -258,8 +255,12 @@ void __init reset_all_zones_managed_pages(void) { struct pglist_data *pgdat; + if (reset_managed_pages_done) + return; + for_each_online_pgdat(pgdat) reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 252e1dbbed86..c5f7fe199a60 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -1096,6 +1097,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) build_all_zonelists(pgdat, NULL); mutex_unlock(&zonelists_mutex); + /* + * zone->managed_pages is set to an approximate value in + * free_area_init_core(), which will cause + * /sys/device/system/node/nodeX/meminfo has wrong data. + * So reset it to 0 before any memory is onlined. + */ + reset_node_managed_pages(pgdat); + return pgdat; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 7c7ab32ee503..90b50468333e 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -145,12 +145,10 @@ static unsigned long __init free_low_memory_core_early(void) static int reset_managed_pages_done __initdata; -static inline void __init reset_node_managed_pages(pg_data_t *pgdat) +void reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - if (reset_managed_pages_done) - return; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) z->managed_pages = 0; } @@ -159,8 +157,12 @@ void __init reset_all_zones_managed_pages(void) { struct pglist_data *pgdat; + if (reset_managed_pages_done) + return; + for_each_online_pgdat(pgdat) reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } -- cgit v1.2.3