From c6e666345e1b79c62ba82339cc7d55a89cb73f88 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 2 Aug 2012 09:48:50 +0200 Subject: block: split discard into aligned requests When a disk has large discard_granularity and small max_discard_sectors, discards are not split with optimal alignment. In the limit case of discard_granularity == max_discard_sectors, no request could be aligned correctly, so in fact you might end up with no discarded logical blocks at all. Another example that helps showing the condition in the patch is with discard_granularity == 64, max_discard_sectors == 128. A request that is submitted for 256 sectors 2..257 will be split in two: 2..129, 130..257. However, only 2 aligned blocks out of 3 are included in the request; 128..191 may be left intact and not discarded. With this patch, the first request will be truncated to ensure good alignment of what's left, and the split will be 2..127, 128..255, 256..257. The patch will also take into account the discard_alignment. At most one extra request will be introduced, because the first request will be reduced by at most granularity-1 sectors, and granularity must be less than max_discard_sectors. Subsequent requests will run on round_down(max_discard_sectors, granularity) sectors, as in the current code. Signed-off-by: Paolo Bonzini Acked-by: Vivek Goyal Tested-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4e72a9d48232..281516ae8b4e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1139,6 +1139,16 @@ static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector & (lim->discard_granularity - 1); } +static inline int bdev_discard_alignment(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (bdev != bdev->bd_contains) + return bdev->bd_part->discard_alignment; + + return q->limits.discard_alignment; +} + static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) { if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) -- cgit v1.2.3 From 85b9f66a41eb8ee3f1dfc95707412705463cdd97 Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 2 Aug 2012 23:42:04 +0200 Subject: block: Add blk_bio_map_sg() helper Add a helper to map a bio to a scatterlist, modelled after blk_rq_map_sg. This helper is useful for any driver that wants to create a scatterlist from its ->make_request_fn method. Changes in v2: - Use __blk_segment_map_sg to avoid duplicated code - Add cocbook style function comment Cc: Rusty Russell Cc: Christoph Hellwig Cc: Tejun Heo Cc: Shaohua Li Cc: "Michael S. Tsirkin" Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Christoph Hellwig Signed-off-by: Minchan Kim Signed-off-by: Asias He Signed-off-by: Jens Axboe --- block/blk-merge.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index 576b68e79248..e76279e41162 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -209,6 +209,43 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_sg); +/** + * blk_bio_map_sg - map a bio to a scatterlist + * @q: request_queue in question + * @bio: bio being mapped + * @sglist: scatterlist being mapped + * + * Note: + * Caller must make sure sg can hold bio->bi_phys_segments entries + * + * Will return the number of sg entries setup + */ +int blk_bio_map_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist) +{ + struct bio_vec *bvec, *bvprv; + struct scatterlist *sg; + int nsegs, cluster; + unsigned long i; + + nsegs = 0; + cluster = blk_queue_cluster(q); + + bvprv = NULL; + sg = NULL; + bio_for_each_segment(bvec, bio, i) { + __blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg, + &nsegs, &cluster); + } /* segments in bio */ + + if (sg) + sg_mark_end(sg); + + BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); + return nsegs; +} +EXPORT_SYMBOL(blk_bio_map_sg); + static inline int ll_new_hw_segment(struct request_queue *q, struct request *req, struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 281516ae8b4e..dc632975d54f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -894,6 +894,8 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); +extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist); extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); -- cgit v1.2.3 From 276f0f5d157bb4a816053f4f3a941dbcd4f76556 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 9 Aug 2012 15:20:23 +0200 Subject: block: disable discard request merge temporarily The SCSI discard request merge never worked, and looks no solution for in future, let's disable it temporarily. Signed-off-by: Shaohua Li Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dc632975d54f..4a2ab7c85393 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -601,7 +601,7 @@ static inline void blk_clear_rl_full(struct request_list *rl, bool sync) * it already be started by driver. */ #define RQ_NOMERGE_FLAGS \ - (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) + (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_DISCARD) #define rq_mergeable(rq) \ (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ (((rq)->cmd_flags & REQ_DISCARD) || \ -- cgit v1.2.3 From 4e8b14526ca7fb046a81c94002c1c43b6fdf0e9b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 8 Aug 2012 15:36:20 -0400 Subject: time: Improve sanity checking of timekeeping inputs Unexpected behavior could occur if the time is set to a value large enough to overflow a 64bit ktime_t (which is something larger then the year 2262). Also unexpected behavior could occur if large negative offsets are injected via adjtimex. So this patch improves the sanity check timekeeping inputs by improving the timespec_valid() check, and then makes better use of timespec_valid() to make sure we don't set the time to an invalid negative value or one that overflows ktime_t. Note: This does not protect from setting the time close to overflowing ktime_t and then letting natural accumulation cause the overflow. Reported-by: CAI Qian Reported-by: Sasha Levin Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Zhouping Liu Cc: Ingo Molnar Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1344454580-17031-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/ktime.h | 7 ------- include/linux/time.h | 22 ++++++++++++++++++++-- kernel/time/timekeeping.c | 26 ++++++++++++++++++++++++-- 3 files changed, 44 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 603bec2913b0..06177ba10a16 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -58,13 +58,6 @@ union ktime { typedef union ktime ktime_t; /* Kill this */ -#define KTIME_MAX ((s64)~((u64)1 << 63)) -#if (BITS_PER_LONG == 64) -# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) -#else -# define KTIME_SEC_MAX LONG_MAX -#endif - /* * ktime_t definitions when using the 64-bit scalar representation: */ diff --git a/include/linux/time.h b/include/linux/time.h index c81c5e40fcb5..b0bbd8f0130d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -107,11 +107,29 @@ static inline struct timespec timespec_sub(struct timespec lhs, return ts_delta; } +#define KTIME_MAX ((s64)~((u64)1 << 63)) +#if (BITS_PER_LONG == 64) +# define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) +#else +# define KTIME_SEC_MAX LONG_MAX +#endif + /* * Returns true if the timespec is norm, false if denorm: */ -#define timespec_valid(ts) \ - (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC)) +static inline bool timespec_valid(const struct timespec *ts) +{ + /* Dates before 1970 are bogus */ + if (ts->tv_sec < 0) + return false; + /* Can't have more nanoseconds then a second */ + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return false; + /* Disallow values that could overflow ktime_t */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + return false; + return true; +} extern void read_persistent_clock(struct timespec *ts); extern void read_boot_clock(struct timespec *ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e16af197a2bc..898bef066a44 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -427,7 +427,7 @@ int do_settimeofday(const struct timespec *tv) struct timespec ts_delta, xt; unsigned long flags; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + if (!timespec_valid(tv)) return -EINVAL; write_seqlock_irqsave(&tk->lock, flags); @@ -463,6 +463,8 @@ int timekeeping_inject_offset(struct timespec *ts) { struct timekeeper *tk = &timekeeper; unsigned long flags; + struct timespec tmp; + int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; @@ -471,10 +473,17 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeping_forward_now(tk); + /* Make sure the proposed value is valid */ + tmp = timespec_add(tk_xtime(tk), *ts); + if (!timespec_valid(&tmp)) { + ret = -EINVAL; + goto error; + } tk_xtime_add(tk, ts); tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); +error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); write_sequnlock_irqrestore(&tk->lock, flags); @@ -482,7 +491,7 @@ int timekeeping_inject_offset(struct timespec *ts) /* signal hrtimers about time change */ clock_was_set(); - return 0; + return ret; } EXPORT_SYMBOL(timekeeping_inject_offset); @@ -649,7 +658,20 @@ void __init timekeeping_init(void) struct timespec now, boot, tmp; read_persistent_clock(&now); + if (!timespec_valid(&now)) { + pr_warn("WARNING: Persistent clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + now.tv_sec = 0; + now.tv_nsec = 0; + } + read_boot_clock(&boot); + if (!timespec_valid(&boot)) { + pr_warn("WARNING: Boot clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + boot.tv_sec = 0; + boot.tv_nsec = 0; + } seqlock_init(&tk->lock); -- cgit v1.2.3 From 58569aee5a1a5dcc25c34a0a2ed9a377874e6b05 Mon Sep 17 00:00:00 2001 From: "Arnaud Patard (Rtp)" Date: Thu, 26 Jul 2012 12:15:46 +0200 Subject: ARM: Orion: Set eth packet size csum offload limit The mv643xx ethernet controller limits the packet size for the TX checksum offloading. This patch sets this limits for Kirkwood and Dove which have smaller limits that the default. As a side note, this patch is an updated version of a patch sent some years ago: http://lists.infradead.org/pipermail/linux-arm-kernel/2010-June/017320.html which seems to have been lost. Signed-off-by: Arnaud Patard Signed-off-by: Jason Cooper Cc: --- arch/arm/mach-dove/common.c | 3 ++- arch/arm/mach-kirkwood/common.c | 4 ++-- arch/arm/mach-mv78xx0/common.c | 6 ++++-- arch/arm/mach-orion5x/common.c | 3 ++- arch/arm/plat-orion/common.c | 8 ++++++-- arch/arm/plat-orion/include/plat/common.h | 6 ++++-- include/linux/mv643xx_eth.h | 2 ++ 7 files changed, 22 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-dove/common.c b/arch/arm/mach-dove/common.c index 4db5de54b6a7..6321567d8eaa 100644 --- a/arch/arm/mach-dove/common.c +++ b/arch/arm/mach-dove/common.c @@ -102,7 +102,8 @@ void __init dove_ehci1_init(void) void __init dove_ge00_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge00_init(eth_data, DOVE_GE00_PHYS_BASE, - IRQ_DOVE_GE00_SUM, IRQ_DOVE_GE00_ERR); + IRQ_DOVE_GE00_SUM, IRQ_DOVE_GE00_ERR, + 1600); } /***************************************************************************** diff --git a/arch/arm/mach-kirkwood/common.c b/arch/arm/mach-kirkwood/common.c index c4b64adcbfce..3226077735b1 100644 --- a/arch/arm/mach-kirkwood/common.c +++ b/arch/arm/mach-kirkwood/common.c @@ -301,7 +301,7 @@ void __init kirkwood_ge00_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge00_init(eth_data, GE00_PHYS_BASE, IRQ_KIRKWOOD_GE00_SUM, - IRQ_KIRKWOOD_GE00_ERR); + IRQ_KIRKWOOD_GE00_ERR, 1600); /* The interface forgets the MAC address assigned by u-boot if the clock is turned off, so claim the clk now. */ clk_prepare_enable(ge0); @@ -315,7 +315,7 @@ void __init kirkwood_ge01_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge01_init(eth_data, GE01_PHYS_BASE, IRQ_KIRKWOOD_GE01_SUM, - IRQ_KIRKWOOD_GE01_ERR); + IRQ_KIRKWOOD_GE01_ERR, 1600); clk_prepare_enable(ge1); } diff --git a/arch/arm/mach-mv78xx0/common.c b/arch/arm/mach-mv78xx0/common.c index b4c53b846c9c..3057f7d4329a 100644 --- a/arch/arm/mach-mv78xx0/common.c +++ b/arch/arm/mach-mv78xx0/common.c @@ -213,7 +213,8 @@ void __init mv78xx0_ge00_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge00_init(eth_data, GE00_PHYS_BASE, IRQ_MV78XX0_GE00_SUM, - IRQ_MV78XX0_GE_ERR); + IRQ_MV78XX0_GE_ERR, + MV643XX_TX_CSUM_DEFAULT_LIMIT); } @@ -224,7 +225,8 @@ void __init mv78xx0_ge01_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge01_init(eth_data, GE01_PHYS_BASE, IRQ_MV78XX0_GE01_SUM, - NO_IRQ); + NO_IRQ, + MV643XX_TX_CSUM_DEFAULT_LIMIT); } diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index 9148b229d0de..410291c67666 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -109,7 +109,8 @@ void __init orion5x_eth_init(struct mv643xx_eth_platform_data *eth_data) { orion_ge00_init(eth_data, ORION5X_ETH_PHYS_BASE, IRQ_ORION5X_ETH_SUM, - IRQ_ORION5X_ETH_ERR); + IRQ_ORION5X_ETH_ERR, + MV643XX_TX_CSUM_DEFAULT_LIMIT); } diff --git a/arch/arm/plat-orion/common.c b/arch/arm/plat-orion/common.c index d245a87dc014..b8b747a9d360 100644 --- a/arch/arm/plat-orion/common.c +++ b/arch/arm/plat-orion/common.c @@ -291,10 +291,12 @@ static struct platform_device orion_ge00 = { void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data, unsigned long mapbase, unsigned long irq, - unsigned long irq_err) + unsigned long irq_err, + unsigned int tx_csum_limit) { fill_resources(&orion_ge00_shared, orion_ge00_shared_resources, mapbase + 0x2000, SZ_16K - 1, irq_err); + orion_ge00_shared_data.tx_csum_limit = tx_csum_limit; ge_complete(&orion_ge00_shared_data, orion_ge00_resources, irq, &orion_ge00_shared, eth_data, &orion_ge00); @@ -343,10 +345,12 @@ static struct platform_device orion_ge01 = { void __init orion_ge01_init(struct mv643xx_eth_platform_data *eth_data, unsigned long mapbase, unsigned long irq, - unsigned long irq_err) + unsigned long irq_err, + unsigned int tx_csum_limit) { fill_resources(&orion_ge01_shared, orion_ge01_shared_resources, mapbase + 0x2000, SZ_16K - 1, irq_err); + orion_ge01_shared_data.tx_csum_limit = tx_csum_limit; ge_complete(&orion_ge01_shared_data, orion_ge01_resources, irq, &orion_ge01_shared, eth_data, &orion_ge01); diff --git a/arch/arm/plat-orion/include/plat/common.h b/arch/arm/plat-orion/include/plat/common.h index e00fdb213609..ae2377ef63e5 100644 --- a/arch/arm/plat-orion/include/plat/common.h +++ b/arch/arm/plat-orion/include/plat/common.h @@ -39,12 +39,14 @@ void __init orion_rtc_init(unsigned long mapbase, void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data, unsigned long mapbase, unsigned long irq, - unsigned long irq_err); + unsigned long irq_err, + unsigned int tx_csum_limit); void __init orion_ge01_init(struct mv643xx_eth_platform_data *eth_data, unsigned long mapbase, unsigned long irq, - unsigned long irq_err); + unsigned long irq_err, + unsigned int tx_csum_limit); void __init orion_ge10_init(struct mv643xx_eth_platform_data *eth_data, unsigned long mapbase, diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 51bf8ada6dc0..49258e0ed1c6 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -15,6 +15,8 @@ #define MV643XX_ETH_SIZE_REG_4 0x2224 #define MV643XX_ETH_BASE_ADDR_ENABLE_REG 0x2290 +#define MV643XX_TX_CSUM_DEFAULT_LIMIT 0 + struct mv643xx_eth_shared_platform_data { struct mbus_dram_target_info *dram; struct platform_device *shared_smi; -- cgit v1.2.3 From c7a9b09b1a4a1fbccb2ec409daec95f9068d77c0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Aug 2012 20:51:54 +0000 Subject: ARM: omap: allow building omap44xx without SMP The new omap4 cpuidle implementation currently requires ARCH_NEEDS_CPU_IDLE_COUPLED, which only works on SMP. This patch makes it possible to build a non-SMP kernel for that platform. This is not normally desired for end-users but can be useful for testing. Without this patch, building rand-0y2jSKT results in: drivers/cpuidle/coupled.c: In function 'cpuidle_coupled_poke': drivers/cpuidle/coupled.c:317:3: error: implicit declaration of function '__smp_call_function_single' [-Werror=implicit-function-declaration] It's not clear if this patch is the best solution for the problem at hand. I have made sure that we can now build the kernel in all configurations, but that does not mean it will actually work on an OMAP44xx. Signed-off-by: Arnd Bergmann Acked-by: Santosh Shilimkar Tested-by: Santosh Shilimkar Cc: Kevin Hilman Cc: Tony Lindgren --- arch/arm/mach-omap2/Kconfig | 2 +- arch/arm/mach-omap2/cpuidle44xx.c | 3 ++- include/linux/cpuidle.h | 4 ++++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig index dd2db025f778..66a8be331caf 100644 --- a/arch/arm/mach-omap2/Kconfig +++ b/arch/arm/mach-omap2/Kconfig @@ -62,7 +62,7 @@ config ARCH_OMAP4 select PM_OPP if PM select USB_ARCH_HAS_EHCI if USB_SUPPORT select ARM_CPU_SUSPEND if PM - select ARCH_NEEDS_CPU_IDLE_COUPLED + select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP config SOC_OMAP5 bool "TI OMAP5" diff --git a/arch/arm/mach-omap2/cpuidle44xx.c b/arch/arm/mach-omap2/cpuidle44xx.c index ee05e193fc61..288bee6cbb76 100644 --- a/arch/arm/mach-omap2/cpuidle44xx.c +++ b/arch/arm/mach-omap2/cpuidle44xx.c @@ -238,8 +238,9 @@ int __init omap4_idle_init(void) for_each_cpu(cpu_id, cpu_online_mask) { dev = &per_cpu(omap4_idle_dev, cpu_id); dev->cpu = cpu_id; +#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED dev->coupled_cpus = *cpu_online_mask; - +#endif cpuidle_register_driver(&omap4_idle_driver); if (cpuidle_register_device(dev)) { diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 040b13b5c14a..279b1eaa8b73 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -194,6 +194,10 @@ static inline int cpuidle_play_dead(void) {return -ENODEV; } #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a); +#else +static inline void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, atomic_t *a) +{ +} #endif /****************************** -- cgit v1.2.3 From cee58483cf56e0ba355fdd97ff5e8925329aa936 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 31 Aug 2012 13:30:06 -0400 Subject: time: Move ktime_t overflow checking into timespec_valid_strict Andreas Bombe reported that the added ktime_t overflow checking added to timespec_valid in commit 4e8b14526ca7 ("time: Improve sanity checking of timekeeping inputs") was causing problems with X.org because it caused timeouts larger then KTIME_T to be invalid. Previously, these large timeouts would be clamped to KTIME_MAX and would never expire, which is valid. This patch splits the ktime_t overflow checking into a new timespec_valid_strict function, and converts the timekeeping codes internal checking to use this more strict function. Reported-and-tested-by: Andreas Bombe Cc: Zhouping Liu Cc: Ingo Molnar Cc: Prarit Bhargava Cc: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: John Stultz Signed-off-by: Linus Torvalds --- include/linux/time.h | 7 +++++++ kernel/time/timekeeping.c | 10 +++++----- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index b0bbd8f0130d..b51e664c83e7 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -125,6 +125,13 @@ static inline bool timespec_valid(const struct timespec *ts) /* Can't have more nanoseconds then a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; + return true; +} + +static inline bool timespec_valid_strict(const struct timespec *ts) +{ + if (!timespec_valid(ts)) + return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0c1485e42be6..34e5eac81424 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -428,7 +428,7 @@ int do_settimeofday(const struct timespec *tv) struct timespec ts_delta, xt; unsigned long flags; - if (!timespec_valid(tv)) + if (!timespec_valid_strict(tv)) return -EINVAL; write_seqlock_irqsave(&tk->lock, flags); @@ -476,7 +476,7 @@ int timekeeping_inject_offset(struct timespec *ts) /* Make sure the proposed value is valid */ tmp = timespec_add(tk_xtime(tk), *ts); - if (!timespec_valid(&tmp)) { + if (!timespec_valid_strict(&tmp)) { ret = -EINVAL; goto error; } @@ -659,7 +659,7 @@ void __init timekeeping_init(void) struct timespec now, boot, tmp; read_persistent_clock(&now); - if (!timespec_valid(&now)) { + if (!timespec_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); now.tv_sec = 0; @@ -667,7 +667,7 @@ void __init timekeeping_init(void) } read_boot_clock(&boot); - if (!timespec_valid(&boot)) { + if (!timespec_valid_strict(&boot)) { pr_warn("WARNING: Boot clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); boot.tv_sec = 0; @@ -713,7 +713,7 @@ static struct timespec timekeeping_suspend_time; static void __timekeeping_inject_sleeptime(struct timekeeper *tk, struct timespec *delta) { - if (!timespec_valid(delta)) { + if (!timespec_valid_strict(delta)) { printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " "sleep delta value!\n"); return; -- cgit v1.2.3 From b6d86d3d6d6e4c9b588d81615c81b5a8292b62ed Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 24 Aug 2012 17:25:01 -0700 Subject: linux/kernel.h: Fix DIV_ROUND_CLOSEST to support negative dividends DIV_ROUND_CLOSEST returns a bad result for negative dividends: DIV_ROUND_CLOSEST(-2, 2) = 0 Most of the time this does not matter. However, in the hardware monitoring subsystem, DIV_ROUND_CLOSEST is sometimes used on integers which can be negative (such as temperatures). Signed-off-by: Guenter Roeck Acked-by: Jean Delvare --- include/linux/kernel.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 604382143bcf..594b419b7d20 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -82,10 +82,18 @@ __x - (__x % (y)); \ } \ ) + +/* + * Divide positive or negative dividend by positive divisor and round + * to closest integer. Result is undefined for negative divisors. + */ #define DIV_ROUND_CLOSEST(x, divisor)( \ { \ - typeof(divisor) __divisor = divisor; \ - (((x) + ((__divisor) / 2)) / (__divisor)); \ + typeof(x) __x = x; \ + typeof(divisor) __d = divisor; \ + (((typeof(x))-1) >= 0 || (__x) >= 0) ? \ + (((__x) + ((__d) / 2)) / (__d)) : \ + (((__x) - ((__d) / 2)) / (__d)); \ } \ ) -- cgit v1.2.3 From a6fa941d94b411bbd2b6421ffbde6db3c93e65ab Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Aug 2012 14:59:25 +0100 Subject: perf_event: Switch to internal refcount, fix race with close() Don't mess with file refcounts (or keep a reference to file, for that matter) in perf_event. Use explicit refcount of its own instead. Deal with the race between the final reference to event going away and new children getting created for it by use of atomic_long_inc_not_zero() in inherit_event(); just have the latter free what it had allocated and return NULL, that works out just fine (children of siblings of something doomed are created as singletons, same as if the child of leader had been created and immediately killed). Signed-off-by: Al Viro Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120820135925.GG23464@ZenIV.linux.org.uk Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 62 ++++++++++++++++++++++++---------------------- 2 files changed, 34 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7602ccb3f40e..ad04dfcd6f35 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -926,7 +926,7 @@ struct perf_event { struct hw_perf_event hw; struct perf_event_context *ctx; - struct file *filp; + atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children diff --git a/kernel/events/core.c b/kernel/events/core.c index b7935fcec7d9..efef4282b8e8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2935,12 +2935,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ -static int perf_release(struct inode *inode, struct file *file) +static void put_event(struct perf_event *event) { - struct perf_event *event = file->private_data; struct task_struct *owner; - file->private_data = NULL; + if (!atomic_long_dec_and_test(&event->refcount)) + return; rcu_read_lock(); owner = ACCESS_ONCE(event->owner); @@ -2975,7 +2975,13 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(owner); } - return perf_event_release_kernel(event); + perf_event_release_kernel(event); +} + +static int perf_release(struct inode *inode, struct file *file) +{ + put_event(file->private_data); + return 0; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) @@ -3227,7 +3233,7 @@ unlock: static const struct file_operations perf_fops; -static struct perf_event *perf_fget_light(int fd, int *fput_needed) +static struct file *perf_fget_light(int fd, int *fput_needed) { struct file *file; @@ -3241,7 +3247,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed) return ERR_PTR(-EBADF); } - return file->private_data; + return file; } static int perf_event_set_output(struct perf_event *event, @@ -3273,19 +3279,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: { + struct file *output_file = NULL; struct perf_event *output_event = NULL; int fput_needed = 0; int ret; if (arg != -1) { - output_event = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_event)) - return PTR_ERR(output_event); + output_file = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_file)) + return PTR_ERR(output_file); + output_event = output_file->private_data; } ret = perf_event_set_output(event, output_event); if (output_event) - fput_light(output_event->filp, fput_needed); + fput_light(output_file, fput_needed); return ret; } @@ -5950,6 +5958,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->mmap_mutex); + atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; @@ -6260,12 +6269,12 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; if (group_fd != -1) { - group_leader = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_leader)) { - err = PTR_ERR(group_leader); + group_file = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_file)) { + err = PTR_ERR(group_file); goto err_fd; } - group_file = group_leader->filp; + group_leader = group_file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6402,7 +6411,6 @@ SYSCALL_DEFINE5(perf_event_open, put_ctx(gctx); } - event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); @@ -6496,7 +6504,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_free; } - event->filp = NULL; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); @@ -6578,7 +6585,7 @@ static void sync_child_event(struct perf_event *child_event, * Release the parent event, if this was the last * reference to it. */ - fput(parent_event->filp); + put_event(parent_event); } static void @@ -6654,9 +6661,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * * __perf_event_exit_task() * sync_child_event() - * fput(parent_event->filp) - * perf_release() - * mutex_lock(&ctx->mutex) + * put_event() + * mutex_lock(&ctx->mutex) * * But since its the parent context it won't be the same instance. */ @@ -6724,7 +6730,7 @@ static void perf_free_event(struct perf_event *event, list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); - fput(parent->filp); + put_event(parent); perf_group_detach(event); list_del_event(event, ctx); @@ -6804,6 +6810,12 @@ inherit_event(struct perf_event *parent_event, NULL, NULL); if (IS_ERR(child_event)) return child_event; + + if (!atomic_long_inc_not_zero(&parent_event->refcount)) { + free_event(child_event); + return NULL; + } + get_ctx(child_ctx); /* @@ -6844,14 +6856,6 @@ inherit_event(struct perf_event *parent_event, add_event_to_ctx(child_event, child_ctx); raw_spin_unlock_irqrestore(&child_ctx->lock, flags); - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - /* * Link this into the parent event's child list */ -- cgit v1.2.3 From 500ad2d8b01390c98bc6dce068bccfa9534b8212 Mon Sep 17 00:00:00 2001 From: "K.Prasad" Date: Thu, 2 Aug 2012 13:46:35 +0530 Subject: perf/hwpb: Invoke __perf_event_disable() if interrupts are already disabled While debugging a warning message on PowerPC while using hardware breakpoints, it was discovered that when perf_event_disable is invoked through hw_breakpoint_handler function with interrupts disabled, a subsequent IPI in the code path would trigger a WARN_ON_ONCE message in smp_call_function_single function. This patch calls __perf_event_disable() when interrupts are already disabled, instead of perf_event_disable(). Reported-by: Edjunior Barbosa Machado Signed-off-by: K.Prasad [naveen.n.rao@linux.vnet.ibm.com: v3: Check to make sure we target current task] Signed-off-by: Naveen N. Rao Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120802081635.5811.17737.stgit@localhost.localdomain [ Fixed build error on MIPS. ] Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 2 +- kernel/events/hw_breakpoint.c | 11 ++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ad04dfcd6f35..33ed9d605f91 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1296,6 +1296,7 @@ extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); +extern int __perf_event_disable(void *info); extern void perf_event_task_tick(void); #else static inline void @@ -1334,6 +1335,7 @@ static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } +static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } #endif diff --git a/kernel/events/core.c b/kernel/events/core.c index efef4282b8e8..7fee567153f0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1253,7 +1253,7 @@ retry: /* * Cross CPU call to disable a performance event */ -static int __perf_event_disable(void *info) +int __perf_event_disable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index bb38c4d3ee12..9a7b487c6fe2 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att int old_type = bp->attr.bp_type; int err = 0; - perf_event_disable(bp); + /* + * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it + * will not be possible to raise IPIs that invoke __perf_event_disable. + * So call the function directly after making sure we are targeting the + * current task. + */ + if (irqs_disabled() && bp->ctx && bp->ctx->task == current) + __perf_event_disable(bp); + else + perf_event_disable(bp); bp->attr.bp_addr = attr->bp_addr; bp->attr.bp_type = attr->bp_type; -- cgit v1.2.3 From 3550ccdb9d8d350e526b809bf3dd92b550a74fe1 Mon Sep 17 00:00:00 2001 From: Ian Chen Date: Wed, 29 Aug 2012 15:05:36 +0900 Subject: mmc: card: Skip secure erase on MoviNAND; causes unrecoverable corruption. For several MoviNAND eMMC parts, there are known issues with secure erase and secure trim. For these specific MoviNAND devices, we skip these operations. Specifically, there is a bug in the eMMC firmware that causes unrecoverable corruption when the MMC is erased with MMC_CAP_ERASE enabled. References: http://forum.xda-developers.com/showthread.php?t=1644364 https://plus.google.com/111398485184813224730/posts/21pTYfTsCkB#111398485184813224730/posts/21pTYfTsCkB Signed-off-by: Ian Chen Reviewed-by: Namjae Jeon Acked-by: Jaehoon Chung Reviewed-by: Linus Walleij Cc: stable [3.0+] Signed-off-by: Chris Ball --- drivers/mmc/card/block.c | 26 +++++++++++++++++++++++++- include/linux/mmc/card.h | 1 + 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index f1c84decb192..172a768036d8 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -1411,7 +1411,8 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) /* complete ongoing async transfer before issuing discard */ if (card->host->areq) mmc_blk_issue_rw_rq(mq, NULL); - if (req->cmd_flags & REQ_SECURE) + if (req->cmd_flags & REQ_SECURE && + !(card->quirks & MMC_QUIRK_SEC_ERASE_TRIM_BROKEN)) ret = mmc_blk_issue_secdiscard_rq(mq, req); else ret = mmc_blk_issue_discard_rq(mq, req); @@ -1716,6 +1717,7 @@ force_ro_fail: #define CID_MANFID_SANDISK 0x2 #define CID_MANFID_TOSHIBA 0x11 #define CID_MANFID_MICRON 0x13 +#define CID_MANFID_SAMSUNG 0x15 static const struct mmc_fixup blk_fixups[] = { @@ -1752,6 +1754,28 @@ static const struct mmc_fixup blk_fixups[] = MMC_FIXUP(CID_NAME_ANY, CID_MANFID_MICRON, 0x200, add_quirk_mmc, MMC_QUIRK_LONG_READ_TIME), + /* + * On these Samsung MoviNAND parts, performing secure erase or + * secure trim can result in unrecoverable corruption due to a + * firmware bug. + */ + MMC_FIXUP("M8G2FA", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, + MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), + MMC_FIXUP("MAG4FA", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, + MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), + MMC_FIXUP("MBG8FA", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, + MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), + MMC_FIXUP("MCGAFA", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, + MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), + MMC_FIXUP("VAL00M", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, + MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), + MMC_FIXUP("VYL00M", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, + MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), + MMC_FIXUP("KYL00M", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, + MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), + MMC_FIXUP("VZL00M", CID_MANFID_SAMSUNG, CID_OEMID_ANY, add_quirk_mmc, + MMC_QUIRK_SEC_ERASE_TRIM_BROKEN), + END_FIXUP }; diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 111aca5e97f3..4b27f9f503e4 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -239,6 +239,7 @@ struct mmc_card { #define MMC_QUIRK_BLK_NO_CMD23 (1<<7) /* Avoid CMD23 for regular multiblock */ #define MMC_QUIRK_BROKEN_BYTE_MODE_512 (1<<8) /* Avoid sending 512 bytes in */ #define MMC_QUIRK_LONG_READ_TIME (1<<9) /* Data read time > CSD says */ +#define MMC_QUIRK_SEC_ERASE_TRIM_BROKEN (1<<10) /* Skip secure for erase/trim */ /* byte mode */ unsigned int poweroff_notify_state; /* eMMC4.5 notify feature */ #define MMC_NO_POWER_NOTIFICATION 0 -- cgit v1.2.3 From c3f52af3e03013db5237e339c817beaae5ec9e3a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 3 Sep 2012 14:56:02 -0400 Subject: NFS: Fix the initialisation of the readdir 'cookieverf' array When the NFS_COOKIEVERF helper macro was converted into a static inline function in commit 99fadcd764 (nfs: convert NFS_*(inode) helpers to static inline), we broke the initialisation of the readdir cookies, since that depended on doing a memset with an argument of 'sizeof(NFS_COOKIEVERF(inode))' which therefore changed from sizeof(be32 cookieverf[2]) to sizeof(be32 *). At this point, NFS_COOKIEVERF seems to be more of an obfuscation than a helper, so the best thing would be to just get rid of it. Also see: https://bugzilla.kernel.org/show_bug.cgi?id=46881 Reported-by: Andi Kleen Reported-by: David Binderman Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/inode.c | 2 +- fs/nfs/nfs3proc.c | 2 +- fs/nfs/nfs4proc.c | 4 ++-- include/linux/nfs_fs.h | 5 ----- 4 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c6e895f0fbf3..9b47610338f5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -154,7 +154,7 @@ static void nfs_zap_caches_locked(struct inode *inode) nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; - memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); + memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; else diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d6b3b5f2d779..69322096c325 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -643,7 +643,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, u64 cookie, struct page **pages, unsigned int count, int plus) { struct inode *dir = dentry->d_inode; - __be32 *verf = NFS_COOKIEVERF(dir); + __be32 *verf = NFS_I(dir)->cookieverf; struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), .cookie = cookie, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 635274140b18..86b4c7361036 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3215,11 +3215,11 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long long)cookie); - nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); + nfs4_setup_readdir(cookie, NFS_I(dir)->cookieverf, dentry, &args); res.pgbase = args.pgbase; status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0); if (status >= 0) { - memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); + memcpy(NFS_I(dir)->cookieverf, res.verifier.data, NFS4_VERIFIER_SIZE); status += args.pgbase; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1f8fc7f9bcd8..4b03f56e280e 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -265,11 +265,6 @@ static inline const struct nfs_rpc_ops *NFS_PROTO(const struct inode *inode) return NFS_SERVER(inode)->nfs_client->rpc_ops; } -static inline __be32 *NFS_COOKIEVERF(const struct inode *inode) -{ - return NFS_I(inode)->cookieverf; -} - static inline unsigned NFS_MINATTRTIMEO(const struct inode *inode) { struct nfs_server *nfss = NFS_SERVER(inode); -- cgit v1.2.3 From 1f1ea6c2d9d8c0be9ec56454b05315273b5de8ce Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 26 Aug 2012 11:44:43 -0700 Subject: NFSv4: Fix buffer overflow checking in __nfs4_get_acl_uncached Pass the checks made by decode_getacl back to __nfs4_get_acl_uncached so that it knows if the acl has been truncated. The current overflow checking is broken, resulting in Oopses on user-triggered nfs4_getfacl calls, and is opaque to the point where several attempts at fixing it have failed. This patch tries to clean up the code in addition to fixing the Oopses by ensuring that the overflow checks are performed in a single place (decode_getacl). If the overflow check failed, we will still be able to report the acl length, but at least we will no longer attempt to cache the acl or copy the truncated contents to user space. Reported-by: Sachin Prabhu Signed-off-by: Trond Myklebust Tested-by: Sachin Prabhu --- fs/nfs/nfs4proc.c | 31 ++++++++++++------------------- fs/nfs/nfs4xdr.c | 14 +++++--------- include/linux/nfs_xdr.h | 2 +- 3 files changed, 18 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6b94f2d52532..1e50326d00dd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3739,7 +3739,7 @@ static void nfs4_write_cached_acl(struct inode *inode, struct page **pages, size struct nfs4_cached_acl *acl; size_t buflen = sizeof(*acl) + acl_len; - if (pages && buflen <= PAGE_SIZE) { + if (buflen <= PAGE_SIZE) { acl = kmalloc(buflen, GFP_KERNEL); if (acl == NULL) goto out; @@ -3784,7 +3784,6 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu }; unsigned int npages = DIV_ROUND_UP(buflen, PAGE_SIZE); int ret = -ENOMEM, i; - size_t acl_len = 0; /* As long as we're doing a round trip to the server anyway, * let's be prepared for a page of acl data. */ @@ -3807,11 +3806,6 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu args.acl_len = npages * PAGE_SIZE; args.acl_pgbase = 0; - /* Let decode_getfacl know not to fail if the ACL data is larger than - * the page we send as a guess */ - if (buf == NULL) - res.acl_flags |= NFS4_ACL_LEN_REQUEST; - dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n", __func__, buf, buflen, npages, args.acl_len); ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), @@ -3819,20 +3813,19 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu if (ret) goto out_free; - acl_len = res.acl_len; - if (acl_len > args.acl_len) - nfs4_write_cached_acl(inode, NULL, 0, acl_len); - else - nfs4_write_cached_acl(inode, pages, res.acl_data_offset, - acl_len); - if (buf) { + /* Handle the case where the passed-in buffer is too short */ + if (res.acl_flags & NFS4_ACL_TRUNC) { + /* Did the user only issue a request for the acl length? */ + if (buf == NULL) + goto out_ok; ret = -ERANGE; - if (acl_len > buflen) - goto out_free; - _copy_from_pages(buf, pages, res.acl_data_offset, - acl_len); + goto out_free; } - ret = acl_len; + nfs4_write_cached_acl(inode, pages, res.acl_data_offset, res.acl_len); + if (buf) + _copy_from_pages(buf, pages, res.acl_data_offset, res.acl_len); +out_ok: + ret = res.acl_len; out_free: for (i = 0; i < npages; i++) if (pages[i]) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1bfbd67c556d..541e796e6db5 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5072,18 +5072,14 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, * are stored with the acl data to handle the problem of * variable length bitmaps.*/ res->acl_data_offset = xdr_stream_pos(xdr) - pg_offset; - - /* We ignore &savep and don't do consistency checks on - * the attr length. Let userspace figure it out.... */ res->acl_len = attrlen; - if (attrlen > (xdr->nwords << 2)) { - if (res->acl_flags & NFS4_ACL_LEN_REQUEST) { - /* getxattr interface called with a NULL buf */ - goto out; - } + + /* Check for receive buffer overflow */ + if (res->acl_len > (xdr->nwords << 2) || + res->acl_len + res->acl_data_offset > xdr->buf->page_len) { + res->acl_flags |= NFS4_ACL_TRUNC; dprintk("NFS: acl reply: attrlen %u > page_len %u\n", attrlen, xdr->nwords << 2); - return -EINVAL; } } else status = -EOPNOTSUPP; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ac7c8ae254f2..be9cf3c7e79e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -652,7 +652,7 @@ struct nfs_getaclargs { }; /* getxattr ACL interface flags */ -#define NFS4_ACL_LEN_REQUEST 0x0001 /* zero length getxattr buffer */ +#define NFS4_ACL_TRUNC 0x0001 /* ACL was truncated */ struct nfs_getaclres { size_t acl_len; size_t acl_data_offset; -- cgit v1.2.3 From 60e233a56609fd963c59e99bd75c663d63fa91b6 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Sun, 2 Sep 2012 15:41:34 +0200 Subject: kobject: fix oops with "input0: bad kobj_uevent_env content in show_uevent()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fengguang Wu writes: > After the __devinit* removal series, I can still get kernel panic in > show_uevent(). So there are more sources of bug.. > > Debug patch: > > @@ -343,8 +343,11 @@ static ssize_t show_uevent(struct device > goto out; > > /* copy keys to file */ > - for (i = 0; i < env->envp_idx; i++) > + dev_err(dev, "uevent %d env[%d]: %s/.../%s\n", env->buflen, env->envp_idx, top_kobj->name, dev->kobj.name); > + for (i = 0; i < env->envp_idx; i++) { > + printk(KERN_ERR "uevent %d env[%d]: %s\n", (int)count, i, env->envp[i]); > count += sprintf(&buf[count], "%s\n", env->envp[i]); > + } > > Oops message, the env[] is again not properly initilized: > > [ 44.068623] input input0: uevent 61 env[805306368]: input0/.../input0 > [ 44.069552] uevent 0 env[0]: (null) This is a completely different CONFIG_HOTPLUG problem, only demonstrating another reason why CONFIG_HOTPLUG should go away. I had a hard time trying to disable it anyway ;-) The problem this time is lots of code assuming that a call to add_uevent_var() will guarantee that env->buflen > 0. This is not true if CONFIG_HOTPLUG is unset. So things like this end up overwriting env->envp_idx because the array index is -1: if (add_uevent_var(env, "MODALIAS=")) return -ENOMEM; len = input_print_modalias(&env->buf[env->buflen - 1], sizeof(env->buf) - env->buflen, dev, 0); Don't know what the best action is, given that there seem to be a *lot* of this around the kernel. This patch "fixes" the problem for me, but I don't know if it can be considered an appropriate fix. [ It is the correct fix for now, for 3.7 forcing CONFIG_HOTPLUG to always be on is the longterm fix, but it's too late for 3.6 and older kernels to resolve this that way - gregkh ] Reported-by: Fengguang Wu Signed-off-by: Bjørn Mork Tested-by: Fengguang Wu Cc: stable Signed-off-by: Greg Kroah-Hartman --- include/linux/kobject.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index fc615a97e2d3..1e57449395b1 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -224,7 +224,7 @@ static inline int kobject_uevent_env(struct kobject *kobj, static inline __printf(2, 3) int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...) -{ return 0; } +{ return -ENOMEM; } static inline int kobject_action_type(const char *buf, size_t count, enum kobject_action *type) -- cgit v1.2.3 From f39c1bfb5a03e2d255451bff05be0d7255298fa4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 7 Sep 2012 11:08:50 -0400 Subject: SUNRPC: Fix a UDP transport regression Commit 43cedbf0e8dfb9c5610eb7985d5f21263e313802 (SUNRPC: Ensure that we grab the XPRT_LOCK before calling xprt_alloc_slot) is causing hangs in the case of NFS over UDP mounts. Since neither the UDP or the RDMA transport mechanism use dynamic slot allocation, we can skip grabbing the socket lock for those transports. Add a new rpc_xprt_op to allow switching between the TCP and UDP/RDMA case. Note that the NFSv4.1 back channel assigns the slot directly through rpc_run_bc_task, so we can ignore that case. Reported-by: Dick Streefland Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org [>= 3.1] --- include/linux/sunrpc/xprt.h | 3 +++ net/sunrpc/xprt.c | 34 ++++++++++++++++++++-------------- net/sunrpc/xprtrdma/transport.c | 1 + net/sunrpc/xprtsock.c | 3 +++ 4 files changed, 27 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index cff40aa7db62..bf8c49ff7530 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -114,6 +114,7 @@ struct rpc_xprt_ops { void (*set_buffer_size)(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize); int (*reserve_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); void (*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); + void (*alloc_slot)(struct rpc_xprt *xprt, struct rpc_task *task); void (*rpcbind)(struct rpc_task *task); void (*set_port)(struct rpc_xprt *xprt, unsigned short port); void (*connect)(struct rpc_task *task); @@ -281,6 +282,8 @@ void xprt_connect(struct rpc_task *task); void xprt_reserve(struct rpc_task *task); int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task); int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); int xprt_prepare_transmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); void xprt_end_transmit(struct rpc_task *task); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a5a402a7d21f..5d7f61d7559c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -969,11 +969,11 @@ static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) return false; } -static void xprt_alloc_slot(struct rpc_task *task) +void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req; + spin_lock(&xprt->reserve_lock); if (!list_empty(&xprt->free)) { req = list_entry(xprt->free.next, struct rpc_rqst, rq_list); list_del(&req->rq_list); @@ -994,12 +994,29 @@ static void xprt_alloc_slot(struct rpc_task *task) default: task->tk_status = -EAGAIN; } + spin_unlock(&xprt->reserve_lock); return; out_init_req: task->tk_status = 0; task->tk_rqstp = req; xprt_request_init(task, xprt); + spin_unlock(&xprt->reserve_lock); +} +EXPORT_SYMBOL_GPL(xprt_alloc_slot); + +void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) +{ + /* Note: grabbing the xprt_lock_write() ensures that we throttle + * new slot allocation if the transport is congested (i.e. when + * reconnecting a stream transport or when out of socket write + * buffer space). + */ + if (xprt_lock_write(xprt, task)) { + xprt_alloc_slot(xprt, task); + xprt_release_write(xprt, task); + } } +EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot); static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { @@ -1083,20 +1100,9 @@ void xprt_reserve(struct rpc_task *task) if (task->tk_rqstp != NULL) return; - /* Note: grabbing the xprt_lock_write() here is not strictly needed, - * but ensures that we throttle new slot allocation if the transport - * is congested (e.g. if reconnecting or if we're out of socket - * write buffer space). - */ task->tk_timeout = 0; task->tk_status = -EAGAIN; - if (!xprt_lock_write(xprt, task)) - return; - - spin_lock(&xprt->reserve_lock); - xprt_alloc_slot(task); - spin_unlock(&xprt->reserve_lock); - xprt_release_write(xprt, task); + xprt->ops->alloc_slot(xprt, task); } static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 06cdbff79e4a..5d9202dc7cb1 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -713,6 +713,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) static struct rpc_xprt_ops xprt_rdma_procs = { .reserve_xprt = xprt_rdma_reserve_xprt, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ + .alloc_slot = xprt_alloc_slot, .release_request = xprt_release_rqst_cong, /* ditto */ .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 400567243f84..a35b8e52e551 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2473,6 +2473,7 @@ static void bc_destroy(struct rpc_xprt *xprt) static struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, + .alloc_slot = xprt_alloc_slot, .rpcbind = xs_local_rpcbind, .set_port = xs_local_set_port, .connect = xs_connect, @@ -2489,6 +2490,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, @@ -2506,6 +2508,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, + .alloc_slot = xprt_lock_and_alloc_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, -- cgit v1.2.3 From a8edc3bf05a3465726afdf635a820761fae0d50b Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Wed, 5 Sep 2012 22:50:48 +0000 Subject: net/mlx4_core: Put Firmware flow steering structures in common header files To allow for usage of the flow steering Firmware structures in more locations over the driver, such as the resource tracker, move them from mcg.c to common header files. Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/mcg.c | 90 +++---------------------------- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 76 ++++++++++++++++++++++++++ include/linux/mlx4/device.h | 2 + 3 files changed, 85 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c index a018ea2a43de..2673f3ba935f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mcg.c +++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c @@ -650,13 +650,6 @@ static int find_entry(struct mlx4_dev *dev, u8 port, return err; } -struct mlx4_net_trans_rule_hw_ctrl { - __be32 ctrl; - __be32 vf_vep_port; - __be32 qpn; - __be32 reserved; -}; - static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl, struct mlx4_net_trans_rule_hw_ctrl *hw) { @@ -680,87 +673,18 @@ static void trans_rule_ctrl_to_hw(struct mlx4_net_trans_rule *ctrl, hw->qpn = cpu_to_be32(ctrl->qpn); } -struct mlx4_net_trans_rule_hw_ib { - u8 size; - u8 rsvd1; - __be16 id; - u32 rsvd2; - __be32 qpn; - __be32 qpn_mask; - u8 dst_gid[16]; - u8 dst_gid_msk[16]; -} __packed; - -struct mlx4_net_trans_rule_hw_eth { - u8 size; - u8 rsvd; - __be16 id; - u8 rsvd1[6]; - u8 dst_mac[6]; - u16 rsvd2; - u8 dst_mac_msk[6]; - u16 rsvd3; - u8 src_mac[6]; - u16 rsvd4; - u8 src_mac_msk[6]; - u8 rsvd5; - u8 ether_type_enable; - __be16 ether_type; - __be16 vlan_id_msk; - __be16 vlan_id; -} __packed; - -struct mlx4_net_trans_rule_hw_tcp_udp { - u8 size; - u8 rsvd; - __be16 id; - __be16 rsvd1[3]; - __be16 dst_port; - __be16 rsvd2; - __be16 dst_port_msk; - __be16 rsvd3; - __be16 src_port; - __be16 rsvd4; - __be16 src_port_msk; -} __packed; - -struct mlx4_net_trans_rule_hw_ipv4 { - u8 size; - u8 rsvd; - __be16 id; - __be32 rsvd1; - __be32 dst_ip; - __be32 dst_ip_msk; - __be32 src_ip; - __be32 src_ip_msk; -} __packed; - -struct _rule_hw { - union { - struct { - u8 size; - u8 rsvd; - __be16 id; - }; - struct mlx4_net_trans_rule_hw_eth eth; - struct mlx4_net_trans_rule_hw_ib ib; - struct mlx4_net_trans_rule_hw_ipv4 ipv4; - struct mlx4_net_trans_rule_hw_tcp_udp tcp_udp; - }; +const u16 __sw_id_hw[] = { + [MLX4_NET_TRANS_RULE_ID_ETH] = 0xE001, + [MLX4_NET_TRANS_RULE_ID_IB] = 0xE005, + [MLX4_NET_TRANS_RULE_ID_IPV6] = 0xE003, + [MLX4_NET_TRANS_RULE_ID_IPV4] = 0xE002, + [MLX4_NET_TRANS_RULE_ID_TCP] = 0xE004, + [MLX4_NET_TRANS_RULE_ID_UDP] = 0xE006 }; static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec, struct _rule_hw *rule_hw) { - static const u16 __sw_id_hw[] = { - [MLX4_NET_TRANS_RULE_ID_ETH] = 0xE001, - [MLX4_NET_TRANS_RULE_ID_IB] = 0xE005, - [MLX4_NET_TRANS_RULE_ID_IPV6] = 0xE003, - [MLX4_NET_TRANS_RULE_ID_IPV4] = 0xE002, - [MLX4_NET_TRANS_RULE_ID_TCP] = 0xE004, - [MLX4_NET_TRANS_RULE_ID_UDP] = 0xE006 - }; - static const size_t __rule_hw_sz[] = { [MLX4_NET_TRANS_RULE_ID_ETH] = sizeof(struct mlx4_net_trans_rule_hw_eth), diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 4d9df8f2a126..dba69d98734a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -690,6 +690,82 @@ struct mlx4_steer { struct list_head steer_entries[MLX4_NUM_STEERS]; }; +struct mlx4_net_trans_rule_hw_ctrl { + __be32 ctrl; + __be32 vf_vep_port; + __be32 qpn; + __be32 reserved; +}; + +struct mlx4_net_trans_rule_hw_ib { + u8 size; + u8 rsvd1; + __be16 id; + u32 rsvd2; + __be32 qpn; + __be32 qpn_mask; + u8 dst_gid[16]; + u8 dst_gid_msk[16]; +} __packed; + +struct mlx4_net_trans_rule_hw_eth { + u8 size; + u8 rsvd; + __be16 id; + u8 rsvd1[6]; + u8 dst_mac[6]; + u16 rsvd2; + u8 dst_mac_msk[6]; + u16 rsvd3; + u8 src_mac[6]; + u16 rsvd4; + u8 src_mac_msk[6]; + u8 rsvd5; + u8 ether_type_enable; + __be16 ether_type; + __be16 vlan_id_msk; + __be16 vlan_id; +} __packed; + +struct mlx4_net_trans_rule_hw_tcp_udp { + u8 size; + u8 rsvd; + __be16 id; + __be16 rsvd1[3]; + __be16 dst_port; + __be16 rsvd2; + __be16 dst_port_msk; + __be16 rsvd3; + __be16 src_port; + __be16 rsvd4; + __be16 src_port_msk; +} __packed; + +struct mlx4_net_trans_rule_hw_ipv4 { + u8 size; + u8 rsvd; + __be16 id; + __be32 rsvd1; + __be32 dst_ip; + __be32 dst_ip_msk; + __be32 src_ip; + __be32 src_ip_msk; +} __packed; + +struct _rule_hw { + union { + struct { + u8 size; + u8 rsvd; + __be16 id; + }; + struct mlx4_net_trans_rule_hw_eth eth; + struct mlx4_net_trans_rule_hw_ib ib; + struct mlx4_net_trans_rule_hw_ipv4 ipv4; + struct mlx4_net_trans_rule_hw_tcp_udp tcp_udp; + }; +}; + struct mlx4_priv { struct mlx4_dev dev; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index bd6c9fcdf2dd..244ba902ab72 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -796,6 +796,8 @@ enum mlx4_net_trans_rule_id { MLX4_NET_TRANS_RULE_NUM, /* should be last */ }; +extern const u16 __sw_id_hw[]; + enum mlx4_net_trans_promisc_mode { MLX4_FS_PROMISC_NONE = 0, MLX4_FS_PROMISC_UPLINK, -- cgit v1.2.3 From 7fb40f87c4195ec1728527f30bc744c47a45b366 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Wed, 5 Sep 2012 22:50:49 +0000 Subject: net/mlx4_core: Add security check / enforcement for flow steering rules set for VMs Since VFs may be mapped to VMs which aren't trusted entities, flow steering rules attached through the wrapper on behalf of VFs must be checked to make sure that their L2 specification relate to MAC address assigned to that VF, and add L2 specification if its missing. Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx4/resource_tracker.c | 116 +++++++++++++++++++++ include/linux/mlx4/device.h | 11 ++ 2 files changed, 127 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 94ceddd17ab2..293c9e820c49 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "mlx4.h" #include "fw.h" @@ -2776,18 +2777,133 @@ ex_put: return err; } +/* + * MAC validation for Flow Steering rules. + * VF can attach rules only with a mac address which is assigned to it. + */ +static int validate_eth_header_mac(int slave, struct _rule_hw *eth_header, + struct list_head *rlist) +{ + struct mac_res *res, *tmp; + __be64 be_mac; + + /* make sure it isn't multicast or broadcast mac*/ + if (!is_multicast_ether_addr(eth_header->eth.dst_mac) && + !is_broadcast_ether_addr(eth_header->eth.dst_mac)) { + list_for_each_entry_safe(res, tmp, rlist, list) { + be_mac = cpu_to_be64(res->mac << 16); + if (!memcmp(&be_mac, eth_header->eth.dst_mac, ETH_ALEN)) + return 0; + } + pr_err("MAC %pM doesn't belong to VF %d, Steering rule rejected\n", + eth_header->eth.dst_mac, slave); + return -EINVAL; + } + return 0; +} + +/* + * In case of missing eth header, append eth header with a MAC address + * assigned to the VF. + */ +static int add_eth_header(struct mlx4_dev *dev, int slave, + struct mlx4_cmd_mailbox *inbox, + struct list_head *rlist, int header_id) +{ + struct mac_res *res, *tmp; + u8 port; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + struct mlx4_net_trans_rule_hw_eth *eth_header; + struct mlx4_net_trans_rule_hw_ipv4 *ip_header; + struct mlx4_net_trans_rule_hw_tcp_udp *l4_header; + __be64 be_mac = 0; + __be64 mac_msk = cpu_to_be64(MLX4_MAC_MASK << 16); + + ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf; + port = be32_to_cpu(ctrl->vf_vep_port) & 0xff; + eth_header = (struct mlx4_net_trans_rule_hw_eth *)(ctrl + 1); + + /* Clear a space in the inbox for eth header */ + switch (header_id) { + case MLX4_NET_TRANS_RULE_ID_IPV4: + ip_header = + (struct mlx4_net_trans_rule_hw_ipv4 *)(eth_header + 1); + memmove(ip_header, eth_header, + sizeof(*ip_header) + sizeof(*l4_header)); + break; + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + l4_header = (struct mlx4_net_trans_rule_hw_tcp_udp *) + (eth_header + 1); + memmove(l4_header, eth_header, sizeof(*l4_header)); + break; + default: + return -EINVAL; + } + list_for_each_entry_safe(res, tmp, rlist, list) { + if (port == res->port) { + be_mac = cpu_to_be64(res->mac << 16); + break; + } + } + if (!be_mac) { + pr_err("Failed adding eth header to FS rule, Can't find matching MAC for port %d .\n", + port); + return -EINVAL; + } + + memset(eth_header, 0, sizeof(*eth_header)); + eth_header->size = sizeof(*eth_header) >> 2; + eth_header->id = cpu_to_be16(__sw_id_hw[MLX4_NET_TRANS_RULE_ID_ETH]); + memcpy(eth_header->dst_mac, &be_mac, ETH_ALEN); + memcpy(eth_header->dst_mac_msk, &mac_msk, ETH_ALEN); + + return 0; + +} + int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, struct mlx4_cmd_mailbox *outbox, struct mlx4_cmd_info *cmd) { + + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *rlist = &tracker->slave_list[slave].res_list[RES_MAC]; int err; + struct mlx4_net_trans_rule_hw_ctrl *ctrl; + struct _rule_hw *rule_header; + int header_id; if (dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) return -EOPNOTSUPP; + ctrl = (struct mlx4_net_trans_rule_hw_ctrl *)inbox->buf; + rule_header = (struct _rule_hw *)(ctrl + 1); + header_id = map_hw_to_sw_id(be16_to_cpu(rule_header->id)); + + switch (header_id) { + case MLX4_NET_TRANS_RULE_ID_ETH: + if (validate_eth_header_mac(slave, rule_header, rlist)) + return -EINVAL; + break; + case MLX4_NET_TRANS_RULE_ID_IPV4: + case MLX4_NET_TRANS_RULE_ID_TCP: + case MLX4_NET_TRANS_RULE_ID_UDP: + pr_warn("Can't attach FS rule without L2 headers, adding L2 header.\n"); + if (add_eth_header(dev, slave, inbox, rlist, header_id)) + return -EINVAL; + vhcr->in_modifier += + sizeof(struct mlx4_net_trans_rule_hw_eth) >> 2; + break; + default: + pr_err("Corrupted mailbox.\n"); + return -EINVAL; + } + err = mlx4_cmd_imm(dev, inbox->dma, &vhcr->out_param, vhcr->in_modifier, 0, MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 244ba902ab72..6e1b0f973a03 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -798,6 +798,17 @@ enum mlx4_net_trans_rule_id { extern const u16 __sw_id_hw[]; +static inline int map_hw_to_sw_id(u16 header_id) +{ + + int i; + for (i = 0; i < MLX4_NET_TRANS_RULE_NUM; i++) { + if (header_id == __sw_id_hw[i]) + return i; + } + return -EINVAL; +} + enum mlx4_net_trans_promisc_mode { MLX4_FS_PROMISC_NONE = 0, MLX4_FS_PROMISC_UPLINK, -- cgit v1.2.3 From c076ada4e4aaf45e1a31ad6de7c6cce36081e045 Mon Sep 17 00:00:00 2001 From: Roland Stigge Date: Wed, 8 Aug 2012 09:42:32 +0200 Subject: i2c: pnx: Fix read transactions of >= 2 bytes On transactions with n>=2 bytes, the controller actually wrongly clocks in n+1 bytes. This is caused by the (wrong) assumption that RFE in the Status Register is 1 iff there is no byte already ordered (via a dummy TX byte). This lead to the implementation of synchronized byte ordering, e.g.: Dummy-TX - RX - Dummy-TX - RX - ... But since RFE actually stays high after some Dummy-TX, it rather looks like: Dummy-TX - Dummy-TX - RX - Dummy-TX - RX - (RX) The last RX byte is clocked in by the bus controller, but ignored by the kernel when filling the userspace buffer. This patch fixes the issue by asking for RX via Dummy-TX asynchronously. Introducing a separate counter for TX bytes. Signed-off-by: Roland Stigge Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-pnx.c | 48 ++++++++++++++++++++++++++------------------ include/linux/i2c-pnx.h | 1 + 2 files changed, 29 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/busses/i2c-pnx.c b/drivers/i2c/busses/i2c-pnx.c index b36e21f9fd53..8488bddfe465 100644 --- a/drivers/i2c/busses/i2c-pnx.c +++ b/drivers/i2c/busses/i2c-pnx.c @@ -291,31 +291,37 @@ static int i2c_pnx_master_rcv(struct i2c_pnx_algo_data *alg_data) * or we didn't 'ask' for it yet. */ if (ioread32(I2C_REG_STS(alg_data)) & mstatus_rfe) { - dev_dbg(&alg_data->adapter.dev, - "%s(): Write dummy data to fill Rx-fifo...\n", - __func__); + /* 'Asking' is done asynchronously, e.g. dummy TX of several + * bytes is done before the first actual RX arrives in FIFO. + * Therefore, ordered bytes (via TX) are counted separately. + */ + if (alg_data->mif.order) { + dev_dbg(&alg_data->adapter.dev, + "%s(): Write dummy data to fill Rx-fifo...\n", + __func__); - if (alg_data->mif.len == 1) { - /* Last byte, do not acknowledge next rcv. */ - val |= stop_bit; + if (alg_data->mif.order == 1) { + /* Last byte, do not acknowledge next rcv. */ + val |= stop_bit; + + /* + * Enable interrupt RFDAIE (data in Rx fifo), + * and disable DRMIE (need data for Tx) + */ + ctl = ioread32(I2C_REG_CTL(alg_data)); + ctl |= mcntrl_rffie | mcntrl_daie; + ctl &= ~mcntrl_drmie; + iowrite32(ctl, I2C_REG_CTL(alg_data)); + } /* - * Enable interrupt RFDAIE (data in Rx fifo), - * and disable DRMIE (need data for Tx) + * Now we'll 'ask' for data: + * For each byte we want to receive, we must + * write a (dummy) byte to the Tx-FIFO. */ - ctl = ioread32(I2C_REG_CTL(alg_data)); - ctl |= mcntrl_rffie | mcntrl_daie; - ctl &= ~mcntrl_drmie; - iowrite32(ctl, I2C_REG_CTL(alg_data)); + iowrite32(val, I2C_REG_TX(alg_data)); + alg_data->mif.order--; } - - /* - * Now we'll 'ask' for data: - * For each byte we want to receive, we must - * write a (dummy) byte to the Tx-FIFO. - */ - iowrite32(val, I2C_REG_TX(alg_data)); - return 0; } @@ -515,6 +521,7 @@ i2c_pnx_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) alg_data->mif.buf = pmsg->buf; alg_data->mif.len = pmsg->len; + alg_data->mif.order = pmsg->len; alg_data->mif.mode = (pmsg->flags & I2C_M_RD) ? I2C_SMBUS_READ : I2C_SMBUS_WRITE; alg_data->mif.ret = 0; @@ -567,6 +574,7 @@ i2c_pnx_xfer(struct i2c_adapter *adap, struct i2c_msg *msgs, int num) /* Cleanup to be sure... */ alg_data->mif.buf = NULL; alg_data->mif.len = 0; + alg_data->mif.order = 0; dev_dbg(&alg_data->adapter.dev, "%s(): exiting, stat = %x\n", __func__, ioread32(I2C_REG_STS(alg_data))); diff --git a/include/linux/i2c-pnx.h b/include/linux/i2c-pnx.h index 1bc74afe7a35..49ed17fdf055 100644 --- a/include/linux/i2c-pnx.h +++ b/include/linux/i2c-pnx.h @@ -22,6 +22,7 @@ struct i2c_pnx_mif { struct timer_list timer; /* Timeout */ u8 * buf; /* Data buffer */ int len; /* Length of data buffer */ + int order; /* RX Bytes to order via TX */ }; struct i2c_pnx_algo_data { -- cgit v1.2.3 From 4b921eda53366b319602351ff4d7256fafa4bd1b Mon Sep 17 00:00:00 2001 From: Karsten Keil Date: Thu, 13 Sep 2012 04:36:20 +0000 Subject: mISDN: Fix wrong usage of flush_work_sync while holding locks It is a bad idea to hold a spinlock and call flush_work_sync. Move the workqueue cleanup outside the spinlock and use cancel_work_sync, on closing the channel this seems to be the more correct function. Remove the never used and constant return value of mISDN_freebchannel. Signed-off-by: Karsten Keil Cc: Signed-off-by: David S. Miller --- drivers/isdn/hardware/mISDN/avmfritz.c | 3 ++- drivers/isdn/hardware/mISDN/mISDNipac.c | 3 ++- drivers/isdn/hardware/mISDN/mISDNisar.c | 3 ++- drivers/isdn/hardware/mISDN/netjet.c | 3 ++- drivers/isdn/hardware/mISDN/w6692.c | 3 ++- drivers/isdn/mISDN/hwchannel.c | 9 ++++----- include/linux/mISDNhw.h | 2 +- 7 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/isdn/hardware/mISDN/avmfritz.c b/drivers/isdn/hardware/mISDN/avmfritz.c index fa6ca4733725..dceaec821b0e 100644 --- a/drivers/isdn/hardware/mISDN/avmfritz.c +++ b/drivers/isdn/hardware/mISDN/avmfritz.c @@ -857,8 +857,9 @@ avm_bctrl(struct mISDNchannel *ch, u32 cmd, void *arg) switch (cmd) { case CLOSE_CHANNEL: test_and_clear_bit(FLG_OPEN, &bch->Flags); + cancel_work_sync(&bch->workq); spin_lock_irqsave(&fc->lock, flags); - mISDN_freebchannel(bch); + mISDN_clear_bchannel(bch); modehdlc(bch, ISDN_P_NONE); spin_unlock_irqrestore(&fc->lock, flags); ch->protocol = ISDN_P_NONE; diff --git a/drivers/isdn/hardware/mISDN/mISDNipac.c b/drivers/isdn/hardware/mISDN/mISDNipac.c index 752e0825591f..ccd7d851be26 100644 --- a/drivers/isdn/hardware/mISDN/mISDNipac.c +++ b/drivers/isdn/hardware/mISDN/mISDNipac.c @@ -1406,8 +1406,9 @@ hscx_bctrl(struct mISDNchannel *ch, u32 cmd, void *arg) switch (cmd) { case CLOSE_CHANNEL: test_and_clear_bit(FLG_OPEN, &bch->Flags); + cancel_work_sync(&bch->workq); spin_lock_irqsave(hx->ip->hwlock, flags); - mISDN_freebchannel(bch); + mISDN_clear_bchannel(bch); hscx_mode(hx, ISDN_P_NONE); spin_unlock_irqrestore(hx->ip->hwlock, flags); ch->protocol = ISDN_P_NONE; diff --git a/drivers/isdn/hardware/mISDN/mISDNisar.c b/drivers/isdn/hardware/mISDN/mISDNisar.c index be5973ded6d6..182ecf0626c2 100644 --- a/drivers/isdn/hardware/mISDN/mISDNisar.c +++ b/drivers/isdn/hardware/mISDN/mISDNisar.c @@ -1588,8 +1588,9 @@ isar_bctrl(struct mISDNchannel *ch, u32 cmd, void *arg) switch (cmd) { case CLOSE_CHANNEL: test_and_clear_bit(FLG_OPEN, &bch->Flags); + cancel_work_sync(&bch->workq); spin_lock_irqsave(ich->is->hwlock, flags); - mISDN_freebchannel(bch); + mISDN_clear_bchannel(bch); modeisar(ich, ISDN_P_NONE); spin_unlock_irqrestore(ich->is->hwlock, flags); ch->protocol = ISDN_P_NONE; diff --git a/drivers/isdn/hardware/mISDN/netjet.c b/drivers/isdn/hardware/mISDN/netjet.c index c3e3e7686273..9bcade59eb73 100644 --- a/drivers/isdn/hardware/mISDN/netjet.c +++ b/drivers/isdn/hardware/mISDN/netjet.c @@ -812,8 +812,9 @@ nj_bctrl(struct mISDNchannel *ch, u32 cmd, void *arg) switch (cmd) { case CLOSE_CHANNEL: test_and_clear_bit(FLG_OPEN, &bch->Flags); + cancel_work_sync(&bch->workq); spin_lock_irqsave(&card->lock, flags); - mISDN_freebchannel(bch); + mISDN_clear_bchannel(bch); mode_tiger(bc, ISDN_P_NONE); spin_unlock_irqrestore(&card->lock, flags); ch->protocol = ISDN_P_NONE; diff --git a/drivers/isdn/hardware/mISDN/w6692.c b/drivers/isdn/hardware/mISDN/w6692.c index 26a86b846099..335fe6455002 100644 --- a/drivers/isdn/hardware/mISDN/w6692.c +++ b/drivers/isdn/hardware/mISDN/w6692.c @@ -1054,8 +1054,9 @@ w6692_bctrl(struct mISDNchannel *ch, u32 cmd, void *arg) switch (cmd) { case CLOSE_CHANNEL: test_and_clear_bit(FLG_OPEN, &bch->Flags); + cancel_work_sync(&bch->workq); spin_lock_irqsave(&card->lock, flags); - mISDN_freebchannel(bch); + mISDN_clear_bchannel(bch); w6692_mode(bc, ISDN_P_NONE); spin_unlock_irqrestore(&card->lock, flags); ch->protocol = ISDN_P_NONE; diff --git a/drivers/isdn/mISDN/hwchannel.c b/drivers/isdn/mISDN/hwchannel.c index ef34fd40867c..2602be23f341 100644 --- a/drivers/isdn/mISDN/hwchannel.c +++ b/drivers/isdn/mISDN/hwchannel.c @@ -148,17 +148,16 @@ mISDN_clear_bchannel(struct bchannel *ch) ch->next_minlen = ch->init_minlen; ch->maxlen = ch->init_maxlen; ch->next_maxlen = ch->init_maxlen; + skb_queue_purge(&ch->rqueue); + ch->rcount = 0; } EXPORT_SYMBOL(mISDN_clear_bchannel); -int +void mISDN_freebchannel(struct bchannel *ch) { + cancel_work_sync(&ch->workq); mISDN_clear_bchannel(ch); - skb_queue_purge(&ch->rqueue); - ch->rcount = 0; - flush_work_sync(&ch->workq); - return 0; } EXPORT_SYMBOL(mISDN_freebchannel); diff --git a/include/linux/mISDNhw.h b/include/linux/mISDNhw.h index d0752eca9b44..9d96d5d4dfed 100644 --- a/include/linux/mISDNhw.h +++ b/include/linux/mISDNhw.h @@ -183,7 +183,7 @@ extern int mISDN_initbchannel(struct bchannel *, unsigned short, unsigned short); extern int mISDN_freedchannel(struct dchannel *); extern void mISDN_clear_bchannel(struct bchannel *); -extern int mISDN_freebchannel(struct bchannel *); +extern void mISDN_freebchannel(struct bchannel *); extern int mISDN_ctrl_bchannel(struct bchannel *, struct mISDN_ctrl_req *); extern void queue_ch_frame(struct mISDNchannel *, u_int, int, struct sk_buff *); -- cgit v1.2.3