From b80d5ccca3a012e91ca64a2a0b13049163a6a698 Mon Sep 17 00:00:00 2001 From: Haiyan Hu Date: Tue, 10 Sep 2013 11:25:37 +0800 Subject: NVMe: Avoid shift operation when writing cq head doorbell Changes the type of dev->db_stride to unsigned and changes the value stored there to be 1 << the current value. Then there is less calculation to be done at completion time. Signed-off-by: Haiyan Hu Signed-off-by: Matthew Wilcox --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 26ebcf41c213..8119a476cb29 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -80,7 +80,7 @@ struct nvme_dev { struct dma_pool *prp_small_pool; int instance; int queue_count; - int db_stride; + u32 db_stride; u32 ctrl_config; struct msix_entry *entry; struct nvme_bar __iomem *bar; -- cgit v1.2.3 From 320a382746e0ab1304476ea7e986a8d416ab99db Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 23 Oct 2013 13:07:34 -0600 Subject: NVMe: compat SG_IO ioctl For 32-bit versions of sg3-utils running on a 64-bit system. This is mostly a copy from the relevent portions of fs/compat_ioctl.c, with slight modifications for going through block_device_operations. Signed-off-by: Keith Busch Reviewed-by: Vishal Verma [fixed up CONFIG_COMPAT=n build problems] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 18 +++++- drivers/block/nvme-scsi.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 1 + 3 files changed, 165 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index df3daeb6227f..e44350de8a21 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1568,10 +1568,26 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, } } +#ifdef CONFIG_COMPAT +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case SG_IO: + return nvme_sg_io32(ns, arg); + } + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif + static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, }; static void nvme_resubmit_bios(struct nvme_queue *nvmeq) diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 4a4ff4eb8e23..4a0ceb64e269 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -3038,6 +3039,152 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) return retcode; } +#ifdef CONFIG_COMPAT +typedef struct sg_io_hdr32 { + compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ + compat_int_t dxfer_direction; /* [i] data transfer direction */ + unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ + unsigned char mx_sb_len; /* [i] max length to write to sbp */ + unsigned short iovec_count; /* [i] 0 implies no scatter gather */ + compat_uint_t dxfer_len; /* [i] byte count of data transfer */ + compat_uint_t dxferp; /* [i], [*io] points to data transfer memory + or scatter gather list */ + compat_uptr_t cmdp; /* [i], [*i] points to command to perform */ + compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */ + compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ + compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... */ + compat_int_t pack_id; /* [i->o] unused internally (normally) */ + compat_uptr_t usr_ptr; /* [i->o] unused internally */ + unsigned char status; /* [o] scsi status */ + unsigned char masked_status; /* [o] shifted, masked scsi status */ + unsigned char msg_status; /* [o] messaging level data (optional) */ + unsigned char sb_len_wr; /* [o] byte count actually written to sbp */ + unsigned short host_status; /* [o] errors from host adapter */ + unsigned short driver_status; /* [o] errors from software driver */ + compat_int_t resid; /* [o] dxfer_len - actual_transferred */ + compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */ + compat_uint_t info; /* [o] auxiliary information */ +} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ + +typedef struct sg_iovec32 { + compat_uint_t iov_base; + compat_uint_t iov_len; +} sg_iovec32_t; + +static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count) +{ + sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1); + sg_iovec32_t __user *iov32 = dxferp; + int i; + + for (i = 0; i < iovec_count; i++) { + u32 base, len; + + if (get_user(base, &iov32[i].iov_base) || + get_user(len, &iov32[i].iov_len) || + put_user(compat_ptr(base), &iov[i].iov_base) || + put_user(len, &iov[i].iov_len)) + return -EFAULT; + } + + if (put_user(iov, &sgio->dxferp)) + return -EFAULT; + return 0; +} + +int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg) +{ + sg_io_hdr32_t __user *sgio32 = (sg_io_hdr32_t __user *)arg; + sg_io_hdr_t __user *sgio; + u16 iovec_count; + u32 data; + void __user *dxferp; + int err; + int interface_id; + + if (get_user(interface_id, &sgio32->interface_id)) + return -EFAULT; + if (interface_id != 'S') + return -EINVAL; + + if (get_user(iovec_count, &sgio32->iovec_count)) + return -EFAULT; + + { + void __user *top = compat_alloc_user_space(0); + void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) + + (iovec_count * sizeof(sg_iovec_t))); + if (new > top) + return -EINVAL; + + sgio = new; + } + + /* Ok, now construct. */ + if (copy_in_user(&sgio->interface_id, &sgio32->interface_id, + (2 * sizeof(int)) + + (2 * sizeof(unsigned char)) + + (1 * sizeof(unsigned short)) + + (1 * sizeof(unsigned int)))) + return -EFAULT; + + if (get_user(data, &sgio32->dxferp)) + return -EFAULT; + dxferp = compat_ptr(data); + if (iovec_count) { + if (sg_build_iovec(sgio, dxferp, iovec_count)) + return -EFAULT; + } else { + if (put_user(dxferp, &sgio->dxferp)) + return -EFAULT; + } + + { + unsigned char __user *cmdp; + unsigned char __user *sbp; + + if (get_user(data, &sgio32->cmdp)) + return -EFAULT; + cmdp = compat_ptr(data); + + if (get_user(data, &sgio32->sbp)) + return -EFAULT; + sbp = compat_ptr(data); + + if (put_user(cmdp, &sgio->cmdp) || + put_user(sbp, &sgio->sbp)) + return -EFAULT; + } + + if (copy_in_user(&sgio->timeout, &sgio32->timeout, + 3 * sizeof(int))) + return -EFAULT; + + if (get_user(data, &sgio32->usr_ptr)) + return -EFAULT; + if (put_user(compat_ptr(data), &sgio->usr_ptr)) + return -EFAULT; + + err = nvme_sg_io(ns, sgio); + if (err >= 0) { + void __user *datap; + + if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, + sizeof(int)) || + get_user(datap, &sgio->usr_ptr) || + put_user((u32)(unsigned long)datap, + &sgio32->usr_ptr) || + copy_in_user(&sgio32->status, &sgio->status, + (4 * sizeof(unsigned char)) + + (2 * sizeof(unsigned short)) + + (3 * sizeof(int)))) + err = -EFAULT; + } + + return err; +} +#endif + int nvme_sg_get_version_num(int __user *ip) { return put_user(sg_version_num, ip); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 8119a476cb29..5bc29197d90e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -165,6 +165,7 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, struct sg_io_hdr; int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); +int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg); int nvme_sg_get_version_num(int __user *ip); #endif /* _LINUX_NVME_H */ -- cgit v1.2.3 From 9a6b94584de1a0467d85b435df9c744c5c45a270 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 10 Dec 2013 13:10:36 -0700 Subject: NVMe: Device resume error handling Adds controller error handling on resume power management. If the device fails to initialize, the device is queued for a reset. If the reset fails, a thread is spawned to remove the pci device. If the device resumes as "busy", the device is responding to admin commands but will not create IO queues. In this case, we need to remove the gendisks and free the IO queues since they can't be used and may be holding bios in their lists. From testing, the dma pools require a pci device so this had to change the pci driver 'remove' to release the dma resources in line with that call instead of after all references to the device are released. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 108 +++++++++++++++++++++++++++++++++++++++------- include/linux/nvme.h | 1 + 2 files changed, 94 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 300766973d76..000bca43c23b 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -58,6 +58,7 @@ module_param(use_threaded_interrupts, int, 0); static DEFINE_SPINLOCK(dev_list_lock); static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; +static struct workqueue_struct *nvme_workq; /* * An NVM Express queue. Each device has at least two (one for admin @@ -1968,7 +1969,6 @@ static int nvme_dev_map(struct nvme_dev *dev) dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) goto disable; - pci_set_drvdata(pdev, dev); dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) goto disable; @@ -1995,9 +1995,9 @@ static void nvme_dev_unmap(struct nvme_dev *dev) if (dev->bar) { iounmap(dev->bar); dev->bar = NULL; + pci_release_regions(dev->pci_dev); } - pci_release_regions(dev->pci_dev); if (pci_is_enabled(dev->pci_dev)) pci_disable_device(dev->pci_dev); } @@ -2085,11 +2085,6 @@ static void nvme_release_instance(struct nvme_dev *dev) static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); - nvme_dev_remove(dev); - nvme_dev_shutdown(dev); - nvme_free_queues(dev); - nvme_release_instance(dev); - nvme_release_prp_pools(dev); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2161,6 +2156,70 @@ static int nvme_dev_start(struct nvme_dev *dev) return result; } +static int nvme_remove_dead_ctrl(void *arg) +{ + struct nvme_dev *dev = (struct nvme_dev *)arg; + struct pci_dev *pdev = dev->pci_dev; + + if (pci_get_drvdata(pdev)) + pci_stop_and_remove_bus_device(pdev); + kref_put(&dev->kref, nvme_free_dev); + return 0; +} + +static void nvme_remove_disks(struct work_struct *ws) +{ + int i; + struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); + + nvme_dev_remove(dev); + spin_lock(&dev_list_lock); + for (i = dev->queue_count - 1; i > 0; i--) { + BUG_ON(!dev->queues[i] || !dev->queues[i]->q_suspended); + nvme_free_queue(dev->queues[i]); + dev->queue_count--; + dev->queues[i] = NULL; + } + spin_unlock(&dev_list_lock); +} + +static int nvme_dev_resume(struct nvme_dev *dev) +{ + int ret; + + ret = nvme_dev_start(dev); + if (ret && ret != -EBUSY) + return ret; + if (ret == -EBUSY) { + spin_lock(&dev_list_lock); + INIT_WORK(&dev->reset_work, nvme_remove_disks); + queue_work(nvme_workq, &dev->reset_work); + spin_unlock(&dev_list_lock); + } + return 0; +} + +static void nvme_dev_reset(struct nvme_dev *dev) +{ + nvme_dev_shutdown(dev); + if (nvme_dev_resume(dev)) { + dev_err(&dev->pci_dev->dev, "Device failed to resume\n"); + kref_get(&dev->kref); + if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", + dev->instance))) { + dev_err(&dev->pci_dev->dev, + "Failed to start controller remove task\n"); + kref_put(&dev->kref, nvme_free_dev); + } + } +} + +static void nvme_reset_failed_dev(struct work_struct *ws) +{ + struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); + nvme_dev_reset(dev); +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { int result = -ENOMEM; @@ -2180,7 +2239,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; - + pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); if (result) goto free; @@ -2232,7 +2291,19 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void nvme_remove(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); + + spin_lock(&dev_list_lock); + list_del_init(&dev->node); + spin_unlock(&dev_list_lock); + + pci_set_drvdata(pdev, NULL); + flush_work(&dev->reset_work); misc_deregister(&dev->miscdev); + nvme_dev_remove(dev); + nvme_dev_shutdown(dev); + nvme_free_queues(dev); + nvme_release_instance(dev); + nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); } @@ -2256,13 +2327,12 @@ static int nvme_resume(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - int ret; - ret = nvme_dev_start(ndev); - /* XXX: should remove gendisks if resume fails */ - if (ret) - nvme_free_queues(ndev); - return ret; + if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) { + INIT_WORK(&ndev->reset_work, nvme_reset_failed_dev); + queue_work(nvme_workq, &ndev->reset_work); + } + return 0; } static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); @@ -2303,9 +2373,14 @@ static int __init nvme_init(void) if (IS_ERR(nvme_thread)) return PTR_ERR(nvme_thread); + result = -ENOMEM; + nvme_workq = create_singlethread_workqueue("nvme"); + if (!nvme_workq) + goto kill_kthread; + result = register_blkdev(nvme_major, "nvme"); if (result < 0) - goto kill_kthread; + goto kill_workq; else if (result > 0) nvme_major = result; @@ -2316,6 +2391,8 @@ static int __init nvme_init(void) unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); + kill_workq: + destroy_workqueue(nvme_workq); kill_kthread: kthread_stop(nvme_thread); return result; @@ -2325,6 +2402,7 @@ static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); unregister_blkdev(nvme_major, "nvme"); + destroy_workqueue(nvme_workq); kthread_stop(nvme_thread); } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5bc29197d90e..eed81cc56d7e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -87,6 +87,7 @@ struct nvme_dev { struct list_head namespaces; struct kref kref; struct miscdevice miscdev; + struct work_struct reset_work; char name[12]; char serial[20]; char model[40]; -- cgit v1.2.3 From ec65993443736a5091b68e80ff1734548944a4b8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 14:33:16 -0800 Subject: mm, x86: Account for TLB flushes only when debugging Bisection between 3.11 and 3.12 fingered commit 9824cf97 ("mm: vmstats: tlb flush counters") to cause overhead problems. The counters are undeniably useful but how often do we really need to debug TLB flush related issues? It does not justify taking the penalty everywhere so make it a debugging option. Signed-off-by: Mel Gorman Tested-by: Davidlohr Bueso Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Cc: Hugh Dickins Cc: Alex Shi Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-XzxjntugxuwpxXhcrxqqh53b@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 6 +++--- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/mm/tlb.c | 14 +++++++------- include/linux/vm_event_item.h | 4 +++- include/linux/vmstat.h | 8 ++++++++ mm/vmstat.c | 4 +++- 6 files changed, 26 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index e6d90babc245..04905bfc508b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr) */ static inline void __flush_tlb_up(void) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); } static inline void flush_tlb_all(void) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb_all(); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index ce2d0a2c3e4f..0e25a1bc5ab5 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Save MTRR state */ @@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Intel (P6) standard MTRRs */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ae699b3bbac8..05446c1cccfe 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,7 +103,7 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; - count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); @@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_start = start; info.flush_end = end; - count_vm_event(NR_TLB_REMOTE_FLUSH); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; @@ -151,7 +151,7 @@ void flush_tlb_current_task(void) preempt_disable(); - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -215,7 +215,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, /* tlb_flushall_shift is on balance point, details in commit log */ if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { if (has_large_page(mm, start, end)) { @@ -224,7 +224,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, } /* flush range by one by one 'invlpg' */ for (addr = start; addr < end; addr += PAGE_SIZE) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -262,7 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) static void do_flush_tlb_all(void *info) { - count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); @@ -270,7 +270,7 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - count_vm_event(NR_TLB_REMOTE_FLUSH); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index c557c6d096de..3a712e2e7d76 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -71,12 +71,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif +#ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ -#endif +#endif /* CONFIG_SMP */ NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, +#endif /* CONFIG_DEBUG_TLBFLUSH */ NR_VM_EVENT_ITEMS }; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index e4b948080d20..80ebba9c2e87 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -83,6 +83,14 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_numa_events(x, y) do { (void)(y); } while (0) #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_DEBUG_TLBFLUSH +#define count_vm_tlb_event(x) count_vm_event(x) +#define count_vm_tlb_events(x, y) count_vm_events(x, y) +#else +#define count_vm_tlb_event(x) do {} while (0) +#define count_vm_tlb_events(x, y) do { (void)(y); } while (0) +#endif + #define __count_zone_vm_events(item, zone, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ zone_idx(zone), delta) diff --git a/mm/vmstat.c b/mm/vmstat.c index 72496140ac08..def5dd2fbe61 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -851,12 +851,14 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_DEBUG_TLBFLUSH #ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", -#endif +#endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", +#endif /* CONFIG_DEBUG_TLBFLUSH */ #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; -- cgit v1.2.3 From 6e5f52674ff0756e61a8879f6232b9ac33735cba Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sat, 25 Jan 2014 22:36:13 +0200 Subject: spi: spi.h: clarify the documentation of transfer_one Explicitly note the transfer_one and transfer_one_message are mutually exclusive, to make the text a little more newcomers friendly. Signed-off-by: Baruch Siach Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index a1d4ca290862..ad050f0dd39d 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -287,7 +287,10 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * - return 1 if the transfer is still in progress. When * the driver is finished with this transfer it must * call spi_finalize_current_transfer() so the subsystem - * can issue the next transfer + * can issue the next transfer. Note: transfer_one and + * transfer_one_message are mutually exclusive; when both + * are set, the generic subsystem does not call your + * transfer_one callback. * @unprepare_message: undo any work done by prepare_message(). * @cs_gpios: Array of GPIOs to use as chip select lines; one per CS * number. Any individual value may be -ENOENT for CS lines that -- cgit v1.2.3 From e9305331f64ab9e5210baa4307355635c5e849f4 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sat, 25 Jan 2014 22:36:15 +0200 Subject: spi: correct the transfer_one_message documentation wording The transfer_one_message callback handles messages, not transfers. Signed-off-by: Baruch Siach Signed-off-by: Mark Brown --- Documentation/spi/spi-summary | 2 +- include/linux/spi/spi.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/spi/spi-summary b/Documentation/spi/spi-summary index dcaad476789c..7982bcc4d151 100644 --- a/Documentation/spi/spi-summary +++ b/Documentation/spi/spi-summary @@ -543,7 +543,7 @@ SPI MASTER METHODS queuing transfers that arrive in the meantime. When the driver is finished with this message, it must call spi_finalize_current_message() so the subsystem can issue the next - transfer. This may sleep. + message. This may sleep. master->transfer_one(struct spi_master *master, struct spi_device *spi, struct spi_transfer *transfer) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index ad050f0dd39d..4203c66d8803 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -273,7 +273,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * message while queuing transfers that arrive in the meantime. When the * driver is finished with this message, it must call * spi_finalize_current_message() so the subsystem can issue the next - * transfer + * message * @unprepare_transfer_hardware: there are currently no more messages on the * queue so the subsystem notifies the driver that it may relax the * hardware by issuing this call -- cgit v1.2.3 From d4b4ff8e28b474fac0fbfa9cfc40f88b9e41e380 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 10 Dec 2013 13:10:37 -0700 Subject: NVMe: Schedule reset for failed controllers Schedules a controller reset when it indicates it has a failed status. If the device does not become ready after a reset, the pci device will be scheduled for removal. Signed-off-by: Keith Busch [fixed checkpatch issue] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 21 +++++++++++++++++++-- include/linux/nvme.h | 1 + 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 000bca43c23b..2f5b9f5f5a21 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -60,6 +60,8 @@ static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; +static void nvme_reset_failed_dev(struct work_struct *ws); + /* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). @@ -1612,13 +1614,25 @@ static void nvme_resubmit_bios(struct nvme_queue *nvmeq) static int nvme_kthread(void *data) { - struct nvme_dev *dev; + struct nvme_dev *dev, *next; while (!kthread_should_stop()) { set_current_state(TASK_INTERRUPTIBLE); spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) { + list_for_each_entry_safe(dev, next, &dev_list, node) { int i; + if (readl(&dev->bar->csts) & NVME_CSTS_CFS && + dev->initialized) { + if (work_busy(&dev->reset_work)) + continue; + list_del_init(&dev->node); + dev_warn(&dev->pci_dev->dev, + "Failed status, reset controller\n"); + INIT_WORK(&dev->reset_work, + nvme_reset_failed_dev); + queue_work(nvme_workq, &dev->reset_work); + continue; + } for (i = 0; i < dev->queue_count; i++) { struct nvme_queue *nvmeq = dev->queues[i]; if (!nvmeq) @@ -2006,6 +2020,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) { int i; + dev->initialized = 0; for (i = dev->queue_count - 1; i >= 0; i--) nvme_disable_queue(dev, i); @@ -2196,6 +2211,7 @@ static int nvme_dev_resume(struct nvme_dev *dev) queue_work(nvme_workq, &dev->reset_work); spin_unlock(&dev_list_lock); } + dev->initialized = 1; return 0; } @@ -2269,6 +2285,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto remove; + dev->initialized = 1; kref_init(&dev->kref); return 0; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index eed81cc56d7e..117d877e8be5 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -95,6 +95,7 @@ struct nvme_dev { u32 max_hw_sectors; u32 stripe_size; u16 oncs; + u8 initialized; }; /* -- cgit v1.2.3 From c30341dc3c436cf43508cd44cdfbb3810c38c195 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 10 Dec 2013 13:10:38 -0700 Subject: NVMe: Abort timed out commands Send nvme abort command to io requests that have timed out on an initialized device. If the command is not returned after another timeout, schedule the controller for reset. Signed-off-by: Keith Busch [fix endianness issues] Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++- include/linux/nvme.h | 1 + include/uapi/linux/nvme.h | 11 ++++++++ 3 files changed, 79 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 2f5b9f5f5a21..75049532df96 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -83,6 +83,7 @@ struct nvme_queue { u16 sq_head; u16 sq_tail; u16 cq_head; + u16 qid; u8 cq_phase; u8 cqe_seen; u8 q_suspended; @@ -100,6 +101,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(sizeof(struct nvme_features) != 64); BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); @@ -114,6 +116,7 @@ struct nvme_cmd_info { nvme_completion_fn fn; void *ctx; unsigned long timeout; + int aborted; }; static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) @@ -157,6 +160,7 @@ static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, info[cmdid].fn = handler; info[cmdid].ctx = ctx; info[cmdid].timeout = jiffies + timeout; + info[cmdid].aborted = 0; return cmdid; } @@ -175,6 +179,7 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) #define CMD_CTX_FLUSH (0x318 + CMD_CTX_BASE) +#define CMD_CTX_ABORT (0x31C + CMD_CTX_BASE) static void special_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) @@ -183,6 +188,10 @@ static void special_completion(struct nvme_dev *dev, void *ctx, return; if (ctx == CMD_CTX_FLUSH) return; + if (ctx == CMD_CTX_ABORT) { + ++dev->abort_limit; + return; + } if (ctx == CMD_CTX_COMPLETED) { dev_warn(&dev->pci_dev->dev, "completed id %d twice on queue %d\n", @@ -1004,6 +1013,56 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, return nvme_submit_admin_cmd(dev, &c, result); } +/** + * nvme_abort_cmd - Attempt aborting a command + * @cmdid: Command id of a timed out IO + * @queue: The queue with timed out IO + * + * Schedule controller reset if the command was already aborted once before and + * still hasn't been returned to the driver, or if this is the admin queue. + */ +static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) +{ + int a_cmdid; + struct nvme_command cmd; + struct nvme_dev *dev = nvmeq->dev; + struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); + + if (!nvmeq->qid || info[cmdid].aborted) { + if (work_busy(&dev->reset_work)) + return; + list_del_init(&dev->node); + dev_warn(&dev->pci_dev->dev, + "I/O %d QID %d timeout, reset controller\n", cmdid, + nvmeq->qid); + INIT_WORK(&dev->reset_work, nvme_reset_failed_dev); + queue_work(nvme_workq, &dev->reset_work); + return; + } + + if (!dev->abort_limit) + return; + + a_cmdid = alloc_cmdid(dev->queues[0], CMD_CTX_ABORT, special_completion, + ADMIN_TIMEOUT); + if (a_cmdid < 0) + return; + + memset(&cmd, 0, sizeof(cmd)); + cmd.abort.opcode = nvme_admin_abort_cmd; + cmd.abort.cid = cmdid; + cmd.abort.sqid = cpu_to_le16(nvmeq->qid); + cmd.abort.command_id = a_cmdid; + + --dev->abort_limit; + info[cmdid].aborted = 1; + info[cmdid].timeout = jiffies + ADMIN_TIMEOUT; + + dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid, + nvmeq->qid); + nvme_submit_cmd(dev->queues[0], &cmd); +} + /** * nvme_cancel_ios - Cancel outstanding I/Os * @queue: The queue to cancel I/Os on @@ -1027,7 +1086,12 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) continue; if (info[cmdid].ctx == CMD_CTX_CANCELLED) continue; - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d\n", cmdid); + if (timeout && nvmeq->dev->initialized) { + nvme_abort_cmd(cmdid, nvmeq); + continue; + } + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid, + nvmeq->qid); ctx = cancel_cmdid(nvmeq, cmdid, &fn); fn(nvmeq->dev, ctx, &cqe); } @@ -1119,6 +1183,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; + nvmeq->qid = qid; nvmeq->q_suspended = 1; dev->queue_count++; @@ -1929,6 +1994,7 @@ static int nvme_dev_add(struct nvme_dev *dev) ctrl = mem; nn = le32_to_cpup(&ctrl->nn); dev->oncs = le16_to_cpup(&ctrl->oncs); + dev->abort_limit = ctrl->acl + 1; memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 117d877e8be5..69ae03f6eb15 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -95,6 +95,7 @@ struct nvme_dev { u32 max_hw_sectors; u32 stripe_size; u16 oncs; + u16 abort_limit; u8 initialized; }; diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 989c04e0c563..e5ab62201119 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -350,6 +350,16 @@ struct nvme_delete_queue { __u32 rsvd11[5]; }; +struct nvme_abort_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 sqid; + __u16 cid; + __u32 rsvd11[5]; +}; + struct nvme_download_firmware { __u8 opcode; __u8 flags; @@ -384,6 +394,7 @@ struct nvme_command { struct nvme_download_firmware dlfw; struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; + struct nvme_abort_cmd abort; }; }; -- cgit v1.2.3 From f0276924fa35a3607920a58cf5d878212824b951 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 31 Dec 2013 11:38:50 +0800 Subject: blk-mq: Don't reserve a tag for flush request Reserving a tag (request) for flush to avoid dead lock is a overkill. A tag is valuable resource. We can track the number of flush requests and disallow having too many pending flush requests allocated. With this patch, blk_mq_alloc_request_pinned() could do a busy nop (but not a dead loop) if too many pending requests are allocated and new flush request is allocated. But this should not be a problem, too many pending flush requests are very rare case. I verified this can fix the deadlock caused by too many pending flush requests. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-flush.c | 8 +++++--- block/blk-mq.c | 46 ++++++++++++++++++++++++++++++---------------- include/linux/blk-mq.h | 3 +++ 3 files changed, 38 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/blk-flush.c b/block/blk-flush.c index 9288aaf35c21..9143e85226c7 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -284,9 +284,8 @@ static void mq_flush_work(struct work_struct *work) q = container_of(work, struct request_queue, mq_flush_work); - /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, - __GFP_WAIT|GFP_ATOMIC, true); + __GFP_WAIT|GFP_ATOMIC, false); rq->cmd_type = REQ_TYPE_FS; rq->end_io = flush_end_io; @@ -408,8 +407,11 @@ void blk_insert_flush(struct request *rq) /* * @policy now records what operations need to be done. Adjust * REQ_FLUSH and FUA for the driver. + * We keep REQ_FLUSH for mq to track flush requests. For !FUA, + * we never dispatch the request directly. */ - rq->cmd_flags &= ~REQ_FLUSH; + if (rq->cmd_flags & REQ_FUA) + rq->cmd_flags &= ~REQ_FLUSH; if (!(fflags & REQ_FUA)) rq->cmd_flags &= ~REQ_FUA; diff --git a/block/blk-mq.c b/block/blk-mq.c index 57039fcd9c93..9072d0ab184f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -194,9 +194,27 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, } static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, - gfp_t gfp, bool reserved) + gfp_t gfp, bool reserved, + int rw) { - return blk_mq_alloc_rq(hctx, gfp, reserved); + struct request *req; + bool is_flush = false; + /* + * flush need allocate a request, leave at least one request for + * non-flush IO to avoid deadlock + */ + if ((rw & REQ_FLUSH) && !(rw & REQ_FLUSH_SEQ)) { + if (atomic_inc_return(&hctx->pending_flush) >= + hctx->queue_depth - hctx->reserved_tags - 1) { + atomic_dec(&hctx->pending_flush); + return NULL; + } + is_flush = true; + } + req = blk_mq_alloc_rq(hctx, gfp, reserved); + if (!req && is_flush) + atomic_dec(&hctx->pending_flush); + return req; } static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, @@ -209,7 +227,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); + rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved, rw); if (rq) { blk_mq_rq_ctx_init(q, ctx, rq, rw); break; @@ -272,6 +290,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; + if ((rq->cmd_flags & REQ_FLUSH) && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + atomic_dec(&hctx->pending_flush); + blk_mq_rq_init(hctx, rq); blk_mq_put_tag(hctx->tags, tag); @@ -900,14 +921,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) hctx = q->mq_ops->map_queue(q, ctx->cpu); trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false, bio->bi_rw); if (likely(rq)) - blk_mq_rq_ctx_init(q, ctx, rq, rw); + blk_mq_rq_ctx_init(q, ctx, rq, bio->bi_rw); else { blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, rw); - rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, - false); + rq = blk_mq_alloc_request_pinned(q, bio->bi_rw, + __GFP_WAIT|GFP_ATOMIC, false); ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); } @@ -1184,7 +1205,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q, hctx->queue_num = i; hctx->flags = reg->flags; hctx->queue_depth = reg->queue_depth; + hctx->reserved_tags = reg->reserved_tags; hctx->cmd_size = reg->cmd_size; + atomic_set(&hctx->pending_flush, 0); blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1309,15 +1332,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, reg->queue_depth = BLK_MQ_MAX_DEPTH; } - /* - * Set aside a tag for flush requests. It will only be used while - * another flush request is in progress but outside the driver. - * - * TODO: only allocate if flushes are supported - */ - reg->queue_depth++; - reg->reserved_tags++; - if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) return ERR_PTR(-EINVAL); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 161b23105b1e..1e8f16f65af4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -36,12 +36,15 @@ struct blk_mq_hw_ctx { struct list_head page_list; struct blk_mq_tags *tags; + atomic_t pending_flush; + unsigned long queued; unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 10 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; unsigned int queue_depth; + unsigned int reserved_tags; unsigned int numa_node; unsigned int cmd_size; /* per-request extra data */ -- cgit v1.2.3 From 0ae89beb283a0db5980d1d4781c7d7be2f2810d6 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 30 Jan 2014 10:11:28 +0100 Subject: can: add destructor for self generated skbs Self generated skbuffs in net/can/bcm.c are setting a skb->sk reference but no explicit destructor which is enforced since Linux 3.11 with commit 376c7311bdb6 (net: add a temporary sanity check in skb_orphan()). This patch adds some helper functions to make sure that a destructor is properly defined when a sock reference is assigned to a CAN related skb. To create an unshared skb owned by the original sock a common helper function has been introduced to replace open coded functions to create CAN echo skbs. Signed-off-by: Oliver Hartkopp Tested-by: Andre Naujoks Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/can/dev.c | 15 +++------------ drivers/net/can/janz-ican3.c | 18 ++++-------------- drivers/net/can/vcan.c | 9 ++++----- include/linux/can/skb.h | 38 ++++++++++++++++++++++++++++++++++++++ net/can/af_can.c | 3 ++- net/can/bcm.c | 4 ++-- 6 files changed, 53 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 13a909822e25..fc59bc6f040b 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -323,19 +323,10 @@ void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, } if (!priv->echo_skb[idx]) { - struct sock *srcsk = skb->sk; - if (atomic_read(&skb->users) != 1) { - struct sk_buff *old_skb = skb; - - skb = skb_clone(old_skb, GFP_ATOMIC); - kfree_skb(old_skb); - if (!skb) - return; - } else - skb_orphan(skb); - - skb->sk = srcsk; + skb = can_create_echo_skb(skb); + if (!skb) + return; /* make settings for echo to reduce code in irq context */ skb->protocol = htons(ETH_P_CAN); diff --git a/drivers/net/can/janz-ican3.c b/drivers/net/can/janz-ican3.c index e24e6690d672..2124c6790687 100644 --- a/drivers/net/can/janz-ican3.c +++ b/drivers/net/can/janz-ican3.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -1133,20 +1134,9 @@ static void ican3_handle_message(struct ican3_dev *mod, struct ican3_msg *msg) */ static void ican3_put_echo_skb(struct ican3_dev *mod, struct sk_buff *skb) { - struct sock *srcsk = skb->sk; - - if (atomic_read(&skb->users) != 1) { - struct sk_buff *old_skb = skb; - - skb = skb_clone(old_skb, GFP_ATOMIC); - kfree_skb(old_skb); - if (!skb) - return; - } else { - skb_orphan(skb); - } - - skb->sk = srcsk; + skb = can_create_echo_skb(skb); + if (!skb) + return; /* save this skb for tx interrupt echo handling */ skb_queue_tail(&mod->echoq, skb); diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c index 0a2a5ee79a17..4e94057ef5cf 100644 --- a/drivers/net/can/vcan.c +++ b/drivers/net/can/vcan.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -109,25 +110,23 @@ static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) stats->rx_packets++; stats->rx_bytes += cfd->len; } - kfree_skb(skb); + consume_skb(skb); return NETDEV_TX_OK; } /* perform standard echo handling for CAN network interfaces */ if (loop) { - struct sock *srcsk = skb->sk; - skb = skb_share_check(skb, GFP_ATOMIC); + skb = can_create_echo_skb(skb); if (!skb) return NETDEV_TX_OK; /* receive with packet counting */ - skb->sk = srcsk; vcan_rx(skb, dev); } else { /* no looped packets => no counting */ - kfree_skb(skb); + consume_skb(skb); } return NETDEV_TX_OK; } diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 2f0543f7510c..f9bbbb472663 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -11,7 +11,9 @@ #define CAN_SKB_H #include +#include #include +#include /* * The struct can_skb_priv is used to transport additional information along @@ -42,4 +44,40 @@ static inline void can_skb_reserve(struct sk_buff *skb) skb_reserve(skb, sizeof(struct can_skb_priv)); } +static inline void can_skb_destructor(struct sk_buff *skb) +{ + sock_put(skb->sk); +} + +static inline void can_skb_set_owner(struct sk_buff *skb, struct sock *sk) +{ + if (sk) { + sock_hold(sk); + skb->destructor = can_skb_destructor; + skb->sk = sk; + } +} + +/* + * returns an unshared skb owned by the original sock to be echo'ed back + */ +static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) +{ + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (likely(nskb)) { + can_skb_set_owner(nskb, skb->sk); + consume_skb(skb); + return nskb; + } else { + kfree_skb(skb); + return NULL; + } + } + + /* we can assume to have an unshared skb with proper owner */ + return skb; +} + #endif /* CAN_SKB_H */ diff --git a/net/can/af_can.c b/net/can/af_can.c index d249874a366d..a27f8aad9e99 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -290,7 +291,7 @@ int can_send(struct sk_buff *skb, int loop) return -ENOMEM; } - newskb->sk = skb->sk; + can_skb_set_owner(newskb, skb->sk); newskb->ip_summed = CHECKSUM_UNNECESSARY; newskb->pkt_type = PACKET_BROADCAST; } diff --git a/net/can/bcm.c b/net/can/bcm.c index 3fc737b214c7..dcb75c0e66c1 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -268,7 +268,7 @@ static void bcm_can_tx(struct bcm_op *op) /* send with loopback */ skb->dev = dev; - skb->sk = op->sk; + can_skb_set_owner(skb, op->sk); can_send(skb, 1); /* update statistics */ @@ -1223,7 +1223,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk) can_skb_prv(skb)->ifindex = dev->ifindex; skb->dev = dev; - skb->sk = sk; + can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ dev_put(dev); -- cgit v1.2.3 From 17ead6c85c3d0ef57a14d1373f1f1cee2ce60ea8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 1 Feb 2014 14:53:23 -0500 Subject: NFSv4: Fix memory corruption in nfs4_proc_open_confirm nfs41_wake_and_assign_slot() relies on the task->tk_msg.rpc_argp and task->tk_msg.rpc_resp always pointing to the session sequence arguments. nfs4_proc_open_confirm tries to pull a fast one by reusing the open sequence structure, thus causing corruption of the NFSv4 slot table. Cc: stable@vger.kernel.org # 3.12+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 ++++---- include/linux/nfs_xdr.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 42da6af77587..2da6a698b8f7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1620,15 +1620,15 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, - &data->o_res.seq_res, task); + nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, + &data->c_res.seq_res, task); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_sequence_done(task, &data->o_res.seq_res); + nfs40_sequence_done(task, &data->c_res.seq_res); data->rpc_status = task->tk_status; if (data->rpc_status == 0) { @@ -1686,7 +1686,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) }; int status; - nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1); + nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1); kref_get(&data->kref); data->rpc_done = 0; data->rpc_status = 0; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 3ccfcecf8999..b2fb167b2e6d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -379,12 +379,14 @@ struct nfs_openres { * Arguments to the open_confirm call. */ struct nfs_open_confirmargs { + struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; nfs4_stateid * stateid; struct nfs_seqid * seqid; }; struct nfs_open_confirmres { + struct nfs4_sequence_res seq_res; nfs4_stateid stateid; struct nfs_seqid * seqid; }; -- cgit v1.2.3 From 788a4d56ff378bff0b8e685d03a962b36903a149 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 4 Feb 2014 18:33:12 +0100 Subject: drivers: phy: Add support for optional phys Add devm_phy_optional_get and phy_optional_get, which should be used when the phy is optional. They does not return an error when the phy does not exist, rather they returns NULL, which is considered as a valid phy, but results in NOPs when used with the consumer API. Signed-off-by: Andrew Lunn Tested-by: Gregory CLEMENT Acked-by: Kishon Vijay Abraham I Signed-off-by: Jason Cooper --- Documentation/phy.txt | 20 +++++++++++++------- drivers/phy/phy-core.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/phy/phy.h | 14 ++++++++++++++ 3 files changed, 72 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/Documentation/phy.txt b/Documentation/phy.txt index 2e24b993e95f..ebff6ee52441 100644 --- a/Documentation/phy.txt +++ b/Documentation/phy.txt @@ -75,14 +75,20 @@ Before the controller can make use of the PHY, it has to get a reference to it. This framework provides the following APIs to get a reference to the PHY. struct phy *phy_get(struct device *dev, const char *string); +struct phy *phy_optional_get(struct device *dev, const char *string); struct phy *devm_phy_get(struct device *dev, const char *string); - -phy_get and devm_phy_get can be used to get the PHY. In the case of dt boot, -the string arguments should contain the phy name as given in the dt data and -in the case of non-dt boot, it should contain the label of the PHY. -The only difference between the two APIs is that devm_phy_get associates the -device with the PHY using devres on successful PHY get. On driver detach, -release function is invoked on the the devres data and devres data is freed. +struct phy *devm_phy_optional_get(struct device *dev, const char *string); + +phy_get, phy_optional_get, devm_phy_get and devm_phy_optional_get can +be used to get the PHY. In the case of dt boot, the string arguments +should contain the phy name as given in the dt data and in the case of +non-dt boot, it should contain the label of the PHY. The two +devm_phy_get associates the device with the PHY using devres on +successful PHY get. On driver detach, release function is invoked on +the the devres data and devres data is freed. phy_optional_get and +devm_phy_optional_get should be used when the phy is optional. These +two functions will never return -ENODEV, but instead returns NULL when +the phy cannot be found. It should be noted that NULL is a valid phy reference. All phy consumer calls on the NULL phy become NOPs. That is the release calls, diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index a9cdeee20d91..5f5b0f4be5be 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -425,6 +425,27 @@ struct phy *phy_get(struct device *dev, const char *string) } EXPORT_SYMBOL_GPL(phy_get); +/** + * phy_optional_get() - lookup and obtain a reference to an optional phy. + * @dev: device that requests this phy + * @string: the phy name as given in the dt data or the name of the controller + * port for non-dt case + * + * Returns the phy driver, after getting a refcount to it; or + * NULL if there is no such phy. The caller is responsible for + * calling phy_put() to release that count. + */ +struct phy *phy_optional_get(struct device *dev, const char *string) +{ + struct phy *phy = phy_get(dev, string); + + if (PTR_ERR(phy) == -ENODEV) + phy = NULL; + + return phy; +} +EXPORT_SYMBOL_GPL(phy_optional_get); + /** * devm_phy_get() - lookup and obtain a reference to a phy. * @dev: device that requests this phy @@ -455,6 +476,30 @@ struct phy *devm_phy_get(struct device *dev, const char *string) } EXPORT_SYMBOL_GPL(devm_phy_get); +/** + * devm_phy_optional_get() - lookup and obtain a reference to an optional phy. + * @dev: device that requests this phy + * @string: the phy name as given in the dt data or phy device name + * for non-dt case + * + * Gets the phy using phy_get(), and associates a device with it using + * devres. On driver detach, release function is invoked on the devres + * data, then, devres data is freed. This differs to devm_phy_get() in + * that if the phy does not exist, it is not considered an error and + * -ENODEV will not be returned. Instead the NULL phy is returned, + * which can be passed to all other phy consumer calls. + */ +struct phy *devm_phy_optional_get(struct device *dev, const char *string) +{ + struct phy *phy = devm_phy_get(dev, string); + + if (PTR_ERR(phy) == -ENODEV) + phy = NULL; + + return phy; +} +EXPORT_SYMBOL_GPL(devm_phy_optional_get); + /** * phy_create() - create a new phy * @dev: device that is creating the new phy diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index e273e5ac19c9..3f83459dbb20 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -146,7 +146,9 @@ static inline void phy_set_bus_width(struct phy *phy, int bus_width) phy->attrs.bus_width = bus_width; } struct phy *phy_get(struct device *dev, const char *string); +struct phy *phy_optional_get(struct device *dev, const char *string); struct phy *devm_phy_get(struct device *dev, const char *string); +struct phy *devm_phy_optional_get(struct device *dev, const char *string); void phy_put(struct phy *phy); void devm_phy_put(struct device *dev, struct phy *phy); struct phy *of_phy_simple_xlate(struct device *dev, @@ -232,11 +234,23 @@ static inline struct phy *phy_get(struct device *dev, const char *string) return ERR_PTR(-ENOSYS); } +static inline struct phy *phy_optional_get(struct device *dev, + const char *string) +{ + return ERR_PTR(-ENOSYS); +} + static inline struct phy *devm_phy_get(struct device *dev, const char *string) { return ERR_PTR(-ENOSYS); } +static inline struct phy *devm_phy_optional_get(struct device *dev, + const char *string) +{ + return ERR_PTR(-ENOSYS); +} + static inline void phy_put(struct phy *phy) { } -- cgit v1.2.3 From 662372e42e46d9bbfcb83e1cce81f6b33cebaddd Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 3 Feb 2014 08:53:44 -0600 Subject: of: restructure for_each macros to fix compile warnings Commit 00b2c76a6a "include/linux/of.h: make for_each_child_of_node() reference its args when CONFIG_OF=n" fixed warnings for unused variables, but introduced variable "used uninitialized" warnings. Simply initializing the variables would result in "set but not used" warnings with W=1. Fix both types of warnings by making all the for_each macros unconditional and rely on the dummy static inline functions to initialize and reference any variables. Reported-by: Geert Uytterhoeven Cc: David Howells Signed-off-by: Rob Herring Acked-by: Grant Likely --- include/linux/of.h | 153 +++++++++++++++++++++++++++++------------------------ 1 file changed, 84 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index 70c64ba17fa5..435cb995904d 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -169,35 +169,15 @@ static inline const char *of_node_full_name(const struct device_node *np) extern struct device_node *of_find_node_by_name(struct device_node *from, const char *name); -#define for_each_node_by_name(dn, name) \ - for (dn = of_find_node_by_name(NULL, name); dn; \ - dn = of_find_node_by_name(dn, name)) extern struct device_node *of_find_node_by_type(struct device_node *from, const char *type); -#define for_each_node_by_type(dn, type) \ - for (dn = of_find_node_by_type(NULL, type); dn; \ - dn = of_find_node_by_type(dn, type)) extern struct device_node *of_find_compatible_node(struct device_node *from, const char *type, const char *compat); -#define for_each_compatible_node(dn, type, compatible) \ - for (dn = of_find_compatible_node(NULL, type, compatible); dn; \ - dn = of_find_compatible_node(dn, type, compatible)) extern struct device_node *of_find_matching_node_and_match( struct device_node *from, const struct of_device_id *matches, const struct of_device_id **match); -static inline struct device_node *of_find_matching_node( - struct device_node *from, - const struct of_device_id *matches) -{ - return of_find_matching_node_and_match(from, matches, NULL); -} -#define for_each_matching_node(dn, matches) \ - for (dn = of_find_matching_node(NULL, matches); dn; \ - dn = of_find_matching_node(dn, matches)) -#define for_each_matching_node_and_match(dn, matches, match) \ - for (dn = of_find_matching_node_and_match(NULL, matches, match); \ - dn; dn = of_find_matching_node_and_match(dn, matches, match)) + extern struct device_node *of_find_node_by_path(const char *path); extern struct device_node *of_find_node_by_phandle(phandle handle); extern struct device_node *of_get_parent(const struct device_node *node); @@ -209,43 +189,11 @@ extern struct device_node *of_get_next_available_child( extern struct device_node *of_get_child_by_name(const struct device_node *node, const char *name); -#define for_each_child_of_node(parent, child) \ - for (child = of_get_next_child(parent, NULL); child != NULL; \ - child = of_get_next_child(parent, child)) - -#define for_each_available_child_of_node(parent, child) \ - for (child = of_get_next_available_child(parent, NULL); child != NULL; \ - child = of_get_next_available_child(parent, child)) - -static inline int of_get_child_count(const struct device_node *np) -{ - struct device_node *child; - int num = 0; - - for_each_child_of_node(np, child) - num++; - - return num; -} - -static inline int of_get_available_child_count(const struct device_node *np) -{ - struct device_node *child; - int num = 0; - - for_each_available_child_of_node(np, child) - num++; - - return num; -} /* cache lookup */ extern struct device_node *of_find_next_cache_node(const struct device_node *); extern struct device_node *of_find_node_with_property( struct device_node *from, const char *prop_name); -#define for_each_node_with_property(dn, prop_name) \ - for (dn = of_find_node_with_property(NULL, prop_name); dn; \ - dn = of_find_node_with_property(dn, prop_name)) extern struct property *of_find_property(const struct device_node *np, const char *name, @@ -367,42 +315,53 @@ static inline struct device_node *of_find_node_by_name(struct device_node *from, return NULL; } -static inline struct device_node *of_get_parent(const struct device_node *node) +static inline struct device_node *of_find_node_by_type(struct device_node *from, + const char *type) { return NULL; } -static inline bool of_have_populated_dt(void) +static inline struct device_node *of_find_matching_node_and_match( + struct device_node *from, + const struct of_device_id *matches, + const struct of_device_id **match) { - return false; + return NULL; } -/* Kill an unused variable warning on a device_node pointer */ -static inline void __of_use_dn(const struct device_node *np) +static inline struct device_node *of_get_parent(const struct device_node *node) { + return NULL; } -#define for_each_child_of_node(parent, child) \ - while (__of_use_dn(parent), __of_use_dn(child), 0) +static inline struct device_node *of_get_next_child( + const struct device_node *node, struct device_node *prev) +{ + return NULL; +} -#define for_each_available_child_of_node(parent, child) \ - while (0) +static inline struct device_node *of_get_next_available_child( + const struct device_node *node, struct device_node *prev) +{ + return NULL; +} -static inline struct device_node *of_get_child_by_name( - const struct device_node *node, - const char *name) +static inline struct device_node *of_find_node_with_property( + struct device_node *from, const char *prop_name) { return NULL; } -static inline int of_get_child_count(const struct device_node *np) +static inline bool of_have_populated_dt(void) { - return 0; + return false; } -static inline int of_get_available_child_count(const struct device_node *np) +static inline struct device_node *of_get_child_by_name( + const struct device_node *node, + const char *name) { - return 0; + return NULL; } static inline int of_device_is_compatible(const struct device_node *device, @@ -569,6 +528,13 @@ extern int of_node_to_nid(struct device_node *np); static inline int of_node_to_nid(struct device_node *device) { return 0; } #endif +static inline struct device_node *of_find_matching_node( + struct device_node *from, + const struct of_device_id *matches) +{ + return of_find_matching_node_and_match(from, matches, NULL); +} + /** * of_property_read_bool - Findfrom a property * @np: device node from which the property value is to be read. @@ -618,6 +584,55 @@ static inline int of_property_read_u32(const struct device_node *np, s; \ s = of_prop_next_string(prop, s)) +#define for_each_node_by_name(dn, name) \ + for (dn = of_find_node_by_name(NULL, name); dn; \ + dn = of_find_node_by_name(dn, name)) +#define for_each_node_by_type(dn, type) \ + for (dn = of_find_node_by_type(NULL, type); dn; \ + dn = of_find_node_by_type(dn, type)) +#define for_each_compatible_node(dn, type, compatible) \ + for (dn = of_find_compatible_node(NULL, type, compatible); dn; \ + dn = of_find_compatible_node(dn, type, compatible)) +#define for_each_matching_node(dn, matches) \ + for (dn = of_find_matching_node(NULL, matches); dn; \ + dn = of_find_matching_node(dn, matches)) +#define for_each_matching_node_and_match(dn, matches, match) \ + for (dn = of_find_matching_node_and_match(NULL, matches, match); \ + dn; dn = of_find_matching_node_and_match(dn, matches, match)) + +#define for_each_child_of_node(parent, child) \ + for (child = of_get_next_child(parent, NULL); child != NULL; \ + child = of_get_next_child(parent, child)) +#define for_each_available_child_of_node(parent, child) \ + for (child = of_get_next_available_child(parent, NULL); child != NULL; \ + child = of_get_next_available_child(parent, child)) + +#define for_each_node_with_property(dn, prop_name) \ + for (dn = of_find_node_with_property(NULL, prop_name); dn; \ + dn = of_find_node_with_property(dn, prop_name)) + +static inline int of_get_child_count(const struct device_node *np) +{ + struct device_node *child; + int num = 0; + + for_each_child_of_node(np, child) + num++; + + return num; +} + +static inline int of_get_available_child_count(const struct device_node *np) +{ + struct device_node *child; + int num = 0; + + for_each_available_child_of_node(np, child) + num++; + + return num; +} + #if defined(CONFIG_PROC_FS) && defined(CONFIG_PROC_DEVICETREE) extern void proc_device_tree_add_node(struct device_node *, struct proc_dir_entry *); extern void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop); -- cgit v1.2.3 From 1db73ae39a97797b7d929eca2af469f82fbb337a Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 3 Feb 2014 09:27:40 +0100 Subject: of/device: Nullify match table in of_match_device() for CONFIG_OF=n If the of_device_id table inside a device driver is protected by #ifdef CONFIG_OF, the driver still has to provide a dummy declaration of the table, or wrap it inside of_match_ptr(), when calling of_match_device() in the CONFIG_OF=n case, else the driver fails to compile with e.g. drivers/spi/spi-rspi.c: In function 'rspi_probe': drivers/spi/spi-rspi.c:1203:26: error: 'rspi_of_match' undeclared (first use in this function) drivers/spi/spi-rspi.c:1203:26: note: each undeclared identifier is reported only once for each function it appears in Make of_match_device() nullify the table pointer if CONFIG_OF=n to fix this. Reported-by: Yoshihiro Shimoda Signed-off-by: Geert Uytterhoeven Signed-off-by: Rob Herring --- include/linux/of_device.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 8d7dd6768cb7..ef370210ffb2 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -78,11 +78,13 @@ static inline int of_device_uevent_modalias(struct device *dev, static inline void of_device_node_put(struct device *dev) { } -static inline const struct of_device_id *of_match_device( +static inline const struct of_device_id *__of_match_device( const struct of_device_id *matches, const struct device *dev) { return NULL; } +#define of_match_device(matches, dev) \ + __of_match_device(of_match_ptr(matches), (dev)) static inline struct device_node *of_cpu_device_node_get(int cpu) { -- cgit v1.2.3 From c4ad8f98bef77c7356aa6a9ad9188a6acc6b849d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Feb 2014 12:54:53 -0800 Subject: execve: use 'struct filename *' for executable name passing This changes 'do_execve()' to get the executable name as a 'struct filename', and to free it when it is done. This is what the normal users want, and it simplifies and streamlines their error handling. The controlled lifetime of the executable name also fixes a use-after-free problem with the trace_sched_process_exec tracepoint: the lifetime of the passed-in string for kernel users was not at all obvious, and the user-mode helper code used UMH_WAIT_EXEC to serialize the pathname allocation lifetime with the execve() having finished, which in turn meant that the trace point that happened after mm_release() of the old process VM ended up using already free'd memory. To solve the kernel string lifetime issue, this simply introduces "getname_kernel()" that works like the normal user-space getname() function, except with the source coming from kernel memory. As Oleg points out, this also means that we could drop the tcomm[] array from 'struct linux_binprm', since the pathname lifetime now covers setup_new_exec(). That would be a separate cleanup. Reported-by: Igor Zhbanov Tested-by: Steven Rostedt Cc: Oleg Nesterov Cc: Al Viro Signed-off-by: Linus Torvalds --- arch/parisc/hpux/fs.c | 15 +-------------- fs/exec.c | 45 +++++++++++++++++++++------------------------ fs/namei.c | 30 ++++++++++++++++++++++++++++++ include/linux/binfmts.h | 1 - include/linux/fs.h | 1 + include/linux/sched.h | 3 ++- init/main.c | 2 +- kernel/auditsc.c | 2 +- kernel/kmod.c | 2 +- 9 files changed, 58 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c index 88d0962de65a..2bedafea3d94 100644 --- a/arch/parisc/hpux/fs.c +++ b/arch/parisc/hpux/fs.c @@ -33,22 +33,9 @@ int hpux_execve(struct pt_regs *regs) { - int error; - struct filename *filename; - - filename = getname((const char __user *) regs->gr[26]); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - - error = do_execve(filename->name, + return do_execve(getname((const char __user *) regs->gr[26]), (const char __user *const __user *) regs->gr[25], (const char __user *const __user *) regs->gr[24]); - - putname(filename); - -out: - return error; } struct hpux_dirent { diff --git a/fs/exec.c b/fs/exec.c index e1529b4c79b1..3d78fccdd723 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -748,11 +748,10 @@ EXPORT_SYMBOL(setup_arg_pages); #endif /* CONFIG_MMU */ -struct file *open_exec(const char *name) +static struct file *do_open_exec(struct filename *name) { struct file *file; int err; - struct filename tmp = { .name = name }; static const struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC | MAY_OPEN, @@ -760,7 +759,7 @@ struct file *open_exec(const char *name) .lookup_flags = LOOKUP_FOLLOW, }; - file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags); + file = do_filp_open(AT_FDCWD, name, &open_exec_flags); if (IS_ERR(file)) goto out; @@ -784,6 +783,12 @@ exit: fput(file); return ERR_PTR(err); } + +struct file *open_exec(const char *name) +{ + struct filename tmp = { .name = name }; + return do_open_exec(&tmp); +} EXPORT_SYMBOL(open_exec); int kernel_read(struct file *file, loff_t offset, @@ -1162,7 +1167,7 @@ int prepare_bprm_creds(struct linux_binprm *bprm) return -ENOMEM; } -void free_bprm(struct linux_binprm *bprm) +static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { @@ -1432,7 +1437,7 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execve_common(const char *filename, +static int do_execve_common(struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp) { @@ -1441,6 +1446,9 @@ static int do_execve_common(const char *filename, struct files_struct *displaced; int retval; + if (IS_ERR(filename)) + return PTR_ERR(filename); + /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs @@ -1473,7 +1481,7 @@ static int do_execve_common(const char *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = open_exec(filename); + file = do_open_exec(filename); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1481,8 +1489,7 @@ static int do_execve_common(const char *filename, sched_exec(); bprm->file = file; - bprm->filename = filename; - bprm->interp = filename; + bprm->filename = bprm->interp = filename->name; retval = bprm_mm_init(bprm); if (retval) @@ -1523,6 +1530,7 @@ static int do_execve_common(const char *filename, acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); + putname(filename); if (displaced) put_files_struct(displaced); return retval; @@ -1544,10 +1552,11 @@ out_files: if (displaced) reset_files_struct(displaced); out_ret: + putname(filename); return retval; } -int do_execve(const char *filename, +int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { @@ -1557,7 +1566,7 @@ int do_execve(const char *filename, } #ifdef CONFIG_COMPAT -static int compat_do_execve(const char *filename, +static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { @@ -1607,25 +1616,13 @@ SYSCALL_DEFINE3(execve, const char __user *const __user *, argv, const char __user *const __user *, envp) { - struct filename *path = getname(filename); - int error = PTR_ERR(path); - if (!IS_ERR(path)) { - error = do_execve(path->name, argv, envp); - putname(path); - } - return error; + return do_execve(getname(filename), argv, envp); } #ifdef CONFIG_COMPAT asmlinkage long compat_sys_execve(const char __user * filename, const compat_uptr_t __user * argv, const compat_uptr_t __user * envp) { - struct filename *path = getname(filename); - int error = PTR_ERR(path); - if (!IS_ERR(path)) { - error = compat_do_execve(path->name, argv, envp); - putname(path); - } - return error; + return compat_do_execve(getname(filename), argv, envp); } #endif diff --git a/fs/namei.c b/fs/namei.c index d580df2e6804..385f7817bfcc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -196,6 +196,7 @@ recopy: goto error; result->uptr = filename; + result->aname = NULL; audit_getname(result); return result; @@ -210,6 +211,35 @@ getname(const char __user * filename) return getname_flags(filename, 0, NULL); } +/* + * The "getname_kernel()" interface doesn't do pathnames longer + * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user. + */ +struct filename * +getname_kernel(const char * filename) +{ + struct filename *result; + char *kname; + int len; + + len = strlen(filename); + if (len >= EMBEDDED_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + result = __getname(); + if (unlikely(!result)) + return ERR_PTR(-ENOMEM); + + kname = (char *)result + sizeof(*result); + result->name = kname; + result->uptr = NULL; + result->aname = NULL; + result->separate = false; + + strlcpy(kname, filename, EMBEDDED_NAME_MAX); + return result; +} + #ifdef CONFIG_AUDITSYSCALL void putname(struct filename *name) { diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index fd8bf3219ef7..b4a745d7d9a9 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -115,7 +115,6 @@ extern int copy_strings_kernel(int argc, const char *const *argv, extern int prepare_bprm_creds(struct linux_binprm *bprm); extern void install_exec_creds(struct linux_binprm *bprm); extern void set_binfmt(struct linux_binfmt *new); -extern void free_bprm(struct linux_binprm *); extern ssize_t read_code(struct file *, unsigned long, loff_t, size_t); #endif /* _LINUX_BINFMTS_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 09f553c59813..d79678c188ad 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2079,6 +2079,7 @@ extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname(const char __user *); +extern struct filename *getname_kernel(const char *); enum { FILE_CREATED = 1, diff --git a/include/linux/sched.h b/include/linux/sched.h index 68a0e84463a0..a781dec1cd0b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -128,6 +128,7 @@ struct bio_list; struct fs_struct; struct perf_event_context; struct blk_plug; +struct filename; /* * List of flags we want to share for kernel threads, @@ -2311,7 +2312,7 @@ extern void do_group_exit(int); extern int allow_signal(int); extern int disallow_signal(int); -extern int do_execve(const char *, +extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); diff --git a/init/main.c b/init/main.c index 2fd9cef70ee8..eb03090cdced 100644 --- a/init/main.c +++ b/init/main.c @@ -812,7 +812,7 @@ void __init load_default_modules(void) static int run_init_process(const char *init_filename) { argv_init[0] = init_filename; - return do_execve(init_filename, + return do_execve(getname_kernel(init_filename), (const char __user *const __user *)argv_init, (const char __user *const __user *)envp_init); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 10176cd5956a..7aef2f4b6c64 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1719,7 +1719,7 @@ void audit_putname(struct filename *name) struct audit_context *context = current->audit_context; BUG_ON(!context); - if (!context->in_syscall) { + if (!name->aname || !context->in_syscall) { #if AUDIT_DEBUG == 2 printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", __FILE__, __LINE__, context->serial, name); diff --git a/kernel/kmod.c b/kernel/kmod.c index b086006c59e7..6b375af4958d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -239,7 +239,7 @@ static int ____call_usermodehelper(void *data) commit_creds(new); - retval = do_execve(sub_info->path, + retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); if (!retval) -- cgit v1.2.3 From a3485d088516be4c2ae2890e8ad64321481f2857 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Tue, 4 Feb 2014 13:42:10 +0100 Subject: gpio: consumer.h: Move forward declarations outside #ifdef Make sure that the forward declared structs in gpio/consumer.h are also visible on the else branch of the CONFIG_GPIOLIB #ifdef. Fixes the following warnings and their associated errors when CONFIG_GPIOLIB is not selected: include/linux/gpio/consumer.h:67:14: warning: 'struct device' declared inside parameter list include/linux/gpio/consumer.h:67:14: warning: its scope is only this definition or declaration, which is probably not what you want [...] Signed-off-by: Lars-Peter Clausen Reviewed-by: Alexandre Courbot Signed-off-by: Linus Walleij --- include/linux/gpio/consumer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 4d34dbbbad4d..7a8144fef406 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -4,8 +4,6 @@ #include #include -#ifdef CONFIG_GPIOLIB - struct device; struct gpio_chip; @@ -18,6 +16,8 @@ struct gpio_chip; */ struct gpio_desc; +#ifdef CONFIG_GPIOLIB + /* Acquire and dispose GPIOs */ struct gpio_desc *__must_check gpiod_get(struct device *dev, const char *con_id); -- cgit v1.2.3 From 579f82901f6f41256642936d7e632f3979ad76d4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 6 Feb 2014 12:04:21 -0800 Subject: swap: add a simple detector for inappropriate swapin readahead This is a patch to improve swap readahead algorithm. It's from Hugh and I slightly changed it. Hugh's original changelog: swapin readahead does a blind readahead, whether or not the swapin is sequential. This may be ok on harddisk, because large reads have relatively small costs, and if the readahead pages are unneeded they can be reclaimed easily - though, what if their allocation forced reclaim of useful pages? But on SSD devices large reads are more expensive than small ones: if the readahead pages are unneeded, reading them in caused significant overhead. This patch adds very simplistic random read detection. Stealing the PageReadahead technique from Konstantin Khlebnikov's patch, avoiding the vma/anon_vma sophistications of Shaohua Li's patch, swapin_nr_pages() simply looks at readahead's current success rate, and narrows or widens its readahead window accordingly. There is little science to its heuristic: it's about as stupid as can be whilst remaining effective. The table below shows elapsed times (in centiseconds) when running a single repetitive swapping load across a 1000MB mapping in 900MB ram with 1GB swap (the harddisk tests had taken painfully too long when I used mem=500M, but SSD shows similar results for that). Vanilla is the 3.6-rc7 kernel on which I started; Shaohua denotes his Sep 3 patch in mmotm and linux-next; HughOld denotes my Oct 1 patch which Shaohua showed to be defective; HughNew this Nov 14 patch, with page_cluster as usual at default of 3 (8-page reads); HughPC4 this same patch with page_cluster 4 (16-page reads); HughPC0 with page_cluster 0 (1-page reads: no readahead). HDD for swapping to harddisk, SSD for swapping to VertexII SSD. Seq for sequential access to the mapping, cycling five times around; Rand for the same number of random touches. Anon for a MAP_PRIVATE anon mapping; Shmem for a MAP_SHARED anon mapping, equivalent to tmpfs. One weakness of Shaohua's vma/anon_vma approach was that it did not optimize Shmem: seen below. Konstantin's approach was perhaps mistuned, 50% slower on Seq: did not compete and is not shown below. HDD Vanilla Shaohua HughOld HughNew HughPC4 HughPC0 Seq Anon 73921 76210 75611 76904 78191 121542 Seq Shmem 73601 73176 73855 72947 74543 118322 Rand Anon 895392 831243 871569 845197 846496 841680 Rand Shmem 1058375 1053486 827935 764955 764376 756489 SSD Vanilla Shaohua HughOld HughNew HughPC4 HughPC0 Seq Anon 24634 24198 24673 25107 21614 70018 Seq Shmem 24959 24932 25052 25703 22030 69678 Rand Anon 43014 26146 28075 25989 26935 25901 Rand Shmem 45349 45215 28249 24268 24138 24332 These tests are, of course, two extremes of a very simple case: under heavier mixed loads I've not yet observed any consistent improvement or degradation, and wider testing would be welcome. Shaohua Li: Test shows Vanilla is slightly better in sequential workload than Hugh's patch. I observed with Hugh's patch sometimes the readahead size is shrinked too fast (from 8 to 1 immediately) in sequential workload if there is no hit. And in such case, continuing doing readahead is good actually. I don't prepare a sophisticated algorithm for the sequential workload because so far we can't guarantee sequential accessed pages are swap out sequentially. So I slightly change Hugh's heuristic - don't shrink readahead size too fast. Here is my test result (unit second, 3 runs average): Vanilla Hugh New Seq 356 370 360 Random 4525 2447 2444 Attached graph is the swapin/swapout throughput I collected with 'vmstat 2'. The first part is running a random workload (till around 1200 of the x-axis) and the second part is running a sequential workload. swapin and swapout throughput are almost identical in steady state in both workloads. These are expected behavior. while in Vanilla, swapin is much bigger than swapout especially in random workload (because wrong readahead). Original patches by: Shaohua Li and Konstantin Khlebnikov. [fengguang.wu@intel.com: swapin_nr_pages() can be static] Signed-off-by: Hugh Dickins Signed-off-by: Shaohua Li Signed-off-by: Fengguang Wu Cc: Rik van Riel Cc: Wu Fengguang Cc: Minchan Kim Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 4 +-- mm/swap_state.c | 63 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 62 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e464b4e987e8..d1fe1a761047 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -228,9 +228,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) PAGEFLAG(MappedToDisk, mappedtodisk) -/* PG_readahead is only used for file reads; PG_reclaim is only for writes */ +/* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) -PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */ +PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) #ifdef CONFIG_HIGHMEM /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 98e85e9c2b2d..e76ace30d436 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void) return ret; } +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); + void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); @@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), entry.val); - if (page) + if (page) { INC_CACHE_INFO(find_success); + if (TestClearPageReadahead(page)) + atomic_inc(&swapin_readahead_hits); + } INC_CACHE_INFO(find_total); return page; @@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int pages, max_pages, last_ra; + static atomic_t last_readahead_pages; + + max_pages = 1 << ACCESS_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + /* + * This heuristic has been found to work well on both sequential and + * random loads, swapping to hard disk or to SSD: please don't ask + * what the "+ 2" means, it just happens to work well, that's all. + */ + pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + if (pages == 2) { + /* + * We can have no readahead hits to judge by: but must not get + * stuck here forever, so check for an adjacent offset instead + * (and don't even bother to check whether swap type is same). + */ + if (offset != prev_offset + 1 && offset != prev_offset - 1) + pages = 1; + prev_offset = offset; + } else { + unsigned int roundup = 4; + while (roundup < pages) + roundup <<= 1; + pages = roundup; + } + + if (pages > max_pages) + pages = max_pages; + + /* Don't shrink readahead too fast */ + last_ra = atomic_read(&last_readahead_pages) / 2; + if (pages < last_ra) + pages = last_ra; + atomic_set(&last_readahead_pages, pages); + + return pages; +} + /** * swapin_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory @@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long offset = swp_offset(entry); + unsigned long entry_offset = swp_offset(entry); + unsigned long offset = entry_offset; unsigned long start_offset, end_offset; - unsigned long mask = (1UL << page_cluster) - 1; + unsigned long mask; struct blk_plug plug; + mask = swapin_nr_pages(offset) - 1; + if (!mask) + goto skip; + /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr); if (!page) continue; + if (offset != entry_offset) + SetPageReadahead(page); page_cache_release(page); } blk_finish_plug(&plug); lru_add_drain(); /* Push any new pages onto the LRU now */ +skip: return read_swap_cache_async(entry, gfp_mask, vma, addr); } -- cgit v1.2.3 From 78c0f98cc9dd46824fa66f35f14ea24ba733d145 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 30 Jan 2014 13:49:48 +0200 Subject: IB/mlx5: Fix binary compatibility with libmlx5 Commit c1be5232d21d ("Fix micro UAR allocator") broke binary compatibility between libmlx5 and mlx5_ib since it defines a different value to the number of micro UARs per page, leading to wrong calculation in libmlx5. This patch defines struct mlx5_ib_alloc_ucontext_req_v2 as an extension to struct mlx5_ib_alloc_ucontext_req. The extended size is determined in mlx5_ib_alloc_ucontext() and in case of old library we use uuarn 0 which works fine -- this is acheived due to create_user_qp() falling back from high to medium then to low class where low class will return 0. For new libraries we use the more sophisticated allocation algorithm. Signed-off-by: Eli Cohen Reviewed-by: Yann Droneaud Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/main.c | 19 +++++++++++++++++-- drivers/infiniband/hw/mlx5/qp.c | 10 ++++++++-- drivers/infiniband/hw/mlx5/user.h | 7 +++++++ include/linux/mlx5/driver.h | 1 + 4 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 9660d093f8cf..f4ef4a24d410 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -536,24 +536,38 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_alloc_ucontext_req req; + struct mlx5_ib_alloc_ucontext_req_v2 req; struct mlx5_ib_alloc_ucontext_resp resp; struct mlx5_ib_ucontext *context; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; int gross_uuars; int num_uars; + int ver; int uuarn; int err; int i; + int reqlen; if (!dev->ib_active) return ERR_PTR(-EAGAIN); - err = ib_copy_from_udata(&req, udata, sizeof(req)); + memset(&req, 0, sizeof(req)); + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); + if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) + ver = 0; + else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + ver = 2; + else + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&req, udata, reqlen); if (err) return ERR_PTR(err); + if (req.flags || req.reserved) + return ERR_PTR(-EINVAL); + if (req.total_num_uuars > MLX5_MAX_UUARS) return ERR_PTR(-ENOMEM); @@ -626,6 +640,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (err) goto out_uars; + uuari->ver = ver; uuari->num_low_latency_uuars = req.num_low_latency_uuars; uuari->uars = uars; uuari->num_uars = num_uars; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 492dc330e907..091576a777e9 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -430,11 +430,17 @@ static int alloc_uuar(struct mlx5_uuar_info *uuari, break; case MLX5_IB_LATENCY_CLASS_MEDIUM: - uuarn = alloc_med_class_uuar(uuari); + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_med_class_uuar(uuari); break; case MLX5_IB_LATENCY_CLASS_HIGH: - uuarn = alloc_high_class_uuar(uuari); + if (uuari->ver < 2) + uuarn = -ENOMEM; + else + uuarn = alloc_high_class_uuar(uuari); break; case MLX5_IB_LATENCY_CLASS_FAST_PATH: diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index 32a2a5dfc523..0f4f8e42a17f 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -62,6 +62,13 @@ struct mlx5_ib_alloc_ucontext_req { __u32 num_low_latency_uuars; }; +struct mlx5_ib_alloc_ucontext_req_v2 { + __u32 total_num_uuars; + __u32 num_low_latency_uuars; + __u32 flags; + __u32 reserved; +}; + struct mlx5_ib_alloc_ucontext_resp { __u32 qp_tab_size; __u32 bf_reg_size; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 554548cd3dd4..32cb18c399c2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -227,6 +227,7 @@ struct mlx5_uuar_info { * protect uuar allocation data structs */ struct mutex lock; + u32 ver; }; struct mlx5_bf { -- cgit v1.2.3 From e28bab4828354583bb66ac09021ca69b341a7db4 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Wed, 15 Jan 2014 17:12:58 -0800 Subject: Drivers: hv: vmbus: Specify the target CPU that should receive notification During the initial VMBUS connect phase, starting with WS2012 R2, we should specify the VPCU in the guest that should receive the notification. Fix this issue. This fix is required to properly connect to the host in the kexeced kernel. Signed-off-by: K. Y. Srinivasan Cc: [3.9+] Signed-off-by: Greg Kroah-Hartman --- drivers/hv/connection.c | 2 ++ include/linux/hyperv.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index af6edf9b1936..855bbda9964d 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -78,6 +78,8 @@ static int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, msg->interrupt_page = virt_to_phys(vmbus_connection.int_page); msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]); msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]); + if (version == VERSION_WIN8) + msg->target_vcpu = hv_context.vp_index[smp_processor_id()]; /* * Add to list before we send the request since we may diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 15da677478dd..344883dce584 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -875,7 +875,7 @@ struct vmbus_channel_relid_released { struct vmbus_channel_initiate_contact { struct vmbus_channel_message_header header; u32 vmbus_version_requested; - u32 padding2; + u32 target_vcpu; /* The VCPU the host should respond to */ u64 interrupt_page; u64 monitor_page1; u64 monitor_page2; -- cgit v1.2.3 From 72a0a36e2854a6eadb4cf2561858f613f9cd4639 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Feb 2014 10:22:36 -0800 Subject: blk-mq: support at_head inserations for blk_execute_rq This is neede for proper SG_IO operation as well as various uses of blk_execute_rq from the SCSI midlayer. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-exec.c | 2 +- block/blk-mq.c | 17 ++++++++++------- include/linux/blk-mq.h | 3 ++- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/blk-exec.c b/block/blk-exec.c index bbfc072a79c2..c68613bb4c79 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -65,7 +65,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * be resued after dying flag is set */ if (q->mq_ops) { - blk_mq_insert_request(q, rq, true); + blk_mq_insert_request(q, rq, at_head, true); return; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 9072d0ab184f..c9306e3403fe 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -714,13 +714,16 @@ static void blk_mq_work_fn(struct work_struct *work) } static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq) + struct request *rq, bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; trace_block_rq_insert(hctx->queue, rq); - list_add_tail(&rq->queuelist, &ctx->rq_list); + if (at_head) + list_add(&rq->queuelist, &ctx->rq_list); + else + list_add_tail(&rq->queuelist, &ctx->rq_list); blk_mq_hctx_mark_pending(hctx, ctx); /* @@ -730,7 +733,7 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, } void blk_mq_insert_request(struct request_queue *q, struct request *rq, - bool run_queue) + bool at_head, bool run_queue) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx, *current_ctx; @@ -749,7 +752,7 @@ void blk_mq_insert_request(struct request_queue *q, struct request *rq, rq->mq_ctx = ctx; } spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, at_head); spin_unlock(&ctx->lock); blk_mq_put_ctx(current_ctx); @@ -781,7 +784,7 @@ void blk_mq_run_request(struct request *rq, bool run_queue, bool async) /* ctx->cpu might be offline */ spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); spin_unlock(&ctx->lock); blk_mq_put_ctx(current_ctx); @@ -819,7 +822,7 @@ static void blk_mq_insert_requests(struct request_queue *q, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); rq->mq_ctx = ctx; - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); } spin_unlock(&ctx->lock); @@ -971,7 +974,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) __blk_mq_free_request(hctx, ctx, rq); else { blk_mq_bio_to_request(rq, bio); - __blk_mq_insert_request(hctx, rq); + __blk_mq_insert_request(hctx, rq, false); } spin_unlock(&ctx->lock); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1e8f16f65af4..b7638be58599 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -122,7 +122,8 @@ void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struc void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); -void blk_mq_insert_request(struct request_queue *, struct request *, bool); +void blk_mq_insert_request(struct request_queue *, struct request *, + bool, bool); void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -- cgit v1.2.3 From 3d4b81eda2211f32886e2978daf6f39885042fc4 Mon Sep 17 00:00:00 2001 From: Sarah Sharp Date: Fri, 31 Jan 2014 11:52:57 -0800 Subject: Revert "usb: xhci: Link TRB must not occur within a USB payload burst" This reverts commit 35773dac5f862cb1c82ea151eba3e2f6de51ec3e. It's a hack that caused regressions in the usb-storage and userspace USB drivers that use usbfs and libusb. Commit 70cabb7d992f "xhci 1.0: Limit arbitrarily-aligned scatter gather." should fix the issues seen with the ax88179_178a driver on xHCI 1.0 hosts, without causing regressions. Signed-off-by: Sarah Sharp Cc: stable@vger.kernel.org # 3.12 --- drivers/usb/host/xhci-ring.c | 54 ++------------------------------------------ include/linux/usb.h | 2 -- 2 files changed, 2 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 0272450a1667..0ed64eb68e48 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -2967,58 +2967,8 @@ static int prepare_ring(struct xhci_hcd *xhci, struct xhci_ring *ep_ring, } while (1) { - if (room_on_ring(xhci, ep_ring, num_trbs)) { - union xhci_trb *trb = ep_ring->enqueue; - unsigned int usable = ep_ring->enq_seg->trbs + - TRBS_PER_SEGMENT - 1 - trb; - u32 nop_cmd; - - /* - * Section 4.11.7.1 TD Fragments states that a link - * TRB must only occur at the boundary between - * data bursts (eg 512 bytes for 480M). - * While it is possible to split a large fragment - * we don't know the size yet. - * Simplest solution is to fill the trb before the - * LINK with nop commands. - */ - if (num_trbs == 1 || num_trbs <= usable || usable == 0) - break; - - if (ep_ring->type != TYPE_BULK) - /* - * While isoc transfers might have a buffer that - * crosses a 64k boundary it is unlikely. - * Since we can't add NOPs without generating - * gaps in the traffic just hope it never - * happens at the end of the ring. - * This could be fixed by writing a LINK TRB - * instead of the first NOP - however the - * TRB_TYPE_LINK_LE32() calls would all need - * changing to check the ring length. - */ - break; - - if (num_trbs >= TRBS_PER_SEGMENT) { - xhci_err(xhci, "Too many fragments %d, max %d\n", - num_trbs, TRBS_PER_SEGMENT - 1); - return -ENOMEM; - } - - nop_cmd = cpu_to_le32(TRB_TYPE(TRB_TR_NOOP) | - ep_ring->cycle_state); - ep_ring->num_trbs_free -= usable; - do { - trb->generic.field[0] = 0; - trb->generic.field[1] = 0; - trb->generic.field[2] = 0; - trb->generic.field[3] = nop_cmd; - trb++; - } while (--usable); - ep_ring->enqueue = trb; - if (room_on_ring(xhci, ep_ring, num_trbs)) - break; - } + if (room_on_ring(xhci, ep_ring, num_trbs)) + break; if (ep_ring == xhci->cmd_ring) { xhci_err(xhci, "Do not support expand command ring\n"); diff --git a/include/linux/usb.h b/include/linux/usb.h index c716da18c668..7f6eb859873e 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1265,8 +1265,6 @@ typedef void (*usb_complete_t)(struct urb *); * @sg: scatter gather buffer list, the buffer size of each element in * the list (except the last) must be divisible by the endpoint's * max packet size if no_sg_constraint isn't set in 'struct usb_bus' - * (FIXME: scatter-gather under xHCI is broken for periodic transfers. - * Do not use urb->sg for interrupt endpoints for now, only bulk.) * @num_mapped_sgs: (internal) number of mapped sg entries * @num_sgs: number of entries in the sg list * @transfer_buffer_length: How big is transfer_buffer. The transfer may -- cgit v1.2.3 From 0668d3065128d39449c097e62dbdb5707820137d Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 2 Jan 2014 16:37:32 -0800 Subject: genirq: Add devm_request_any_context_irq() Some drivers use request_any_context_irq() but there isn't a devm_* function for it. Add one so that these drivers don't need to explicitly free the irq on driver detach. Signed-off-by: Stephen Boyd Cc: linux-arm-kernel@lists.infradead.org Cc: Dmitry Torokhov Link: http://lkml.kernel.org/r/1388709460-19222-3-git-send-email-sboyd@codeaurora.org Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 5 +++++ kernel/irq/devres.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0053adde0ed9..a2678d35b5a2 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -158,6 +158,11 @@ devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, devname, dev_id); } +extern int __must_check +devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id); + extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); /* diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index bd8e788d71e0..1ef0606797c9 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -72,6 +72,51 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, } EXPORT_SYMBOL(devm_request_threaded_irq); +/** + * devm_request_any_context_irq - allocate an interrupt line for a managed device + * @dev: device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @thread_fn: function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as + * request_any_context_irq(). IRQs requested with this function will be + * automatically freed on driver detach. + * + * If an IRQ allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + */ +int devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + struct irq_devres *dr; + int rc; + + dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), + GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id); + if (rc) { + devres_free(dr); + return rc; + } + + dr->irq = irq; + dr->dev_id = dev_id; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL(devm_request_any_context_irq); + /** * devm_free_irq - free an interrupt * @dev: device to free interrupt for -- cgit v1.2.3 From d311d79de305f1ada47cadd672e6ed1b28a949eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Feb 2014 15:18:09 -0500 Subject: fix O_SYNC|O_APPEND syncing the wrong range on write() It actually goes back to 2004 ([PATCH] Concurrent O_SYNC write support) when sync_page_range() had been introduced; generic_file_write{,v}() correctly synced pos_after_write - written .. pos_after_write - 1 but generic_file_aio_write() synced pos_before_write .. pos_before_write + written - 1 instead. Which is not the same thing with O_APPEND, obviously. A couple of years later correct variant had been killed off when everything switched to use of generic_file_aio_write(). All users of generic_file_aio_write() are affected, and the same bug has been copied into other instances of ->aio_write(). The fix is trivial; the only subtle point is that generic_write_sync() ought to be inlined to avoid calculations useless for the majority of calls. Signed-off-by: Al Viro --- fs/cifs/file.c | 4 ++-- fs/ext4/file.c | 2 +- fs/ntfs/file.c | 2 +- fs/sync.c | 17 ----------------- fs/xfs/xfs_file.c | 2 +- include/linux/fs.h | 8 +++++++- mm/filemap.c | 4 ++-- 7 files changed, 14 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 853d6d1cc822..a7eda8ebfacc 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2559,8 +2559,8 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, if (rc > 0) { ssize_t err; - err = generic_write_sync(file, pos, rc); - if (err < 0 && rc > 0) + err = generic_write_sync(file, iocb->ki_pos - rc, rc); + if (err < 0) rc = err; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 43e64f6022eb..1a5073959f32 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -152,7 +152,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, if (ret > 0) { ssize_t err; - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0 && ret > 0) ret = err; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index ea4ba9daeb47..db9bd8a31725 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2134,7 +2134,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); if (ret > 0) { - int err = generic_write_sync(file, pos, ret); + int err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/sync.c b/fs/sync.c index f15537452231..e8ba024a055b 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -222,23 +222,6 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) return do_fsync(fd, 1); } -/** - * generic_write_sync - perform syncing after a write if file / inode is sync - * @file: file to which the write happened - * @pos: offset where the write started - * @count: length of the write - * - * This is just a simple wrapper about our general syncing function. - */ -int generic_write_sync(struct file *file, loff_t pos, loff_t count) -{ - if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) - return 0; - return vfs_fsync_range(file, pos, pos + count - 1, - (file->f_flags & __O_SYNC) ? 0 : 1); -} -EXPORT_SYMBOL(generic_write_sync); - /* * sys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2e7989e3a2d6..64b48eade91d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -799,7 +799,7 @@ xfs_file_aio_write( XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ - err = generic_write_sync(file, pos, ret); + err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 09f553c59813..75ff961be051 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2273,7 +2273,13 @@ extern int filemap_fdatawrite_range(struct address_space *mapping, extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); -extern int generic_write_sync(struct file *file, loff_t pos, loff_t count); +static inline int generic_write_sync(struct file *file, loff_t pos, loff_t count) +{ + if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) + return 0; + return vfs_fsync_range(file, pos, pos + count - 1, + (file->f_flags & __O_SYNC) ? 0 : 1); +} extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK diff --git a/mm/filemap.c b/mm/filemap.c index d56d3c145b9f..7a13f6ac5421 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2553,8 +2553,8 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ret > 0) { ssize_t err; - err = generic_write_sync(file, pos, ret); - if (err < 0 && ret > 0) + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) ret = err; } return ret; -- cgit v1.2.3 From c4540a7d8c1e595560e53acedf88901daf15a2b5 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Sun, 9 Feb 2014 18:30:39 +0530 Subject: fs: Add prototype declaration to appropriate header file include/linux/bio.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add prototype declaration to header file include/linux/bio.h because it is used by more than one file. This eliminates the following warning in bio-integrity.c: fs/bio-integrity.c:214:14: warning: no previous prototype for ‘bio_integrity_tag_size’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 70654521dab6..d6791bba8264 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -332,6 +332,7 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); extern struct bio_set *fs_bio_set; +unsigned int bio_integrity_tag_size(struct bio *bio); static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { -- cgit v1.2.3 From 30a91cb4ef385fe1b260df204ef314d86fff2850 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Feb 2014 03:24:38 -0800 Subject: blk-mq: rework I/O completions Rework I/O completions to work more like the old code path. blk_mq_end_io now stays out of the business of deferring completions to others CPUs and calling blk_mark_rq_complete. The latter is very important to allow completing requests that have timed out and thus are already marked completed, the former allows using the IPI callout even for driver specific completions instead of having to reimplement them. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 52 ++++++++++++++++++++++++++++++-------------------- block/blk-mq.h | 3 +-- block/blk-timeout.c | 2 +- include/linux/blk-mq.h | 4 ++++ 4 files changed, 37 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index cee96234bf58..14c8f35946e1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -326,7 +326,7 @@ static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error) bio_endio(bio, error); } -void blk_mq_complete_request(struct request *rq, int error) +void blk_mq_end_io(struct request *rq, int error) { struct bio *bio = rq->bio; unsigned int bytes = 0; @@ -351,46 +351,53 @@ void blk_mq_complete_request(struct request *rq, int error) else blk_mq_free_request(rq); } +EXPORT_SYMBOL(blk_mq_end_io); -void __blk_mq_end_io(struct request *rq, int error) -{ - if (!blk_mark_rq_complete(rq)) - blk_mq_complete_request(rq, error); -} - -static void blk_mq_end_io_remote(void *data) +static void __blk_mq_complete_request_remote(void *data) { struct request *rq = data; - __blk_mq_end_io(rq, rq->errors); + rq->q->softirq_done_fn(rq); } -/* - * End IO on this request on a multiqueue enabled driver. We'll either do - * it directly inline, or punt to a local IPI handler on the matching - * remote CPU. - */ -void blk_mq_end_io(struct request *rq, int error) +void __blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; int cpu; - if (!ctx->ipi_redirect) - return __blk_mq_end_io(rq, error); + if (!ctx->ipi_redirect) { + rq->q->softirq_done_fn(rq); + return; + } cpu = get_cpu(); if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { - rq->errors = error; - rq->csd.func = blk_mq_end_io_remote; + rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; __smp_call_function_single(ctx->cpu, &rq->csd, 0); } else { - __blk_mq_end_io(rq, error); + rq->q->softirq_done_fn(rq); } put_cpu(); } -EXPORT_SYMBOL(blk_mq_end_io); + +/** + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed + * + * Description: + * Ends all I/O on a request. It does not handle partial completions. + * The actual completion happens out-of-order, through a IPI handler. + **/ +void blk_mq_complete_request(struct request *rq) +{ + if (unlikely(blk_should_fake_timeout(rq->q))) + return; + if (!blk_mark_rq_complete(rq)) + __blk_mq_complete_request(rq); +} +EXPORT_SYMBOL(blk_mq_complete_request); static void blk_mq_start_request(struct request *rq) { @@ -1399,6 +1406,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, if (reg->timeout) blk_queue_rq_timeout(q, reg->timeout); + if (reg->ops->complete) + blk_queue_softirq_done(q, reg->ops->complete); + blk_mq_init_flush(q); blk_mq_init_cpu_queues(q, reg->nr_hw_queues); diff --git a/block/blk-mq.h b/block/blk-mq.h index 5c3917984b00..f29b645f0e1c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -22,8 +22,7 @@ struct blk_mq_ctx { struct kobject kobj; }; -void __blk_mq_end_io(struct request *rq, int error); -void blk_mq_complete_request(struct request *rq, int error); +void __blk_mq_complete_request(struct request *rq); void blk_mq_run_request(struct request *rq, bool run_queue, bool async); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_init_flush(struct request_queue *q); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index bba81c9348e1..d96f7061c6fd 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -91,7 +91,7 @@ static void blk_rq_timed_out(struct request *req) case BLK_EH_HANDLED: /* Can we use req->errors here? */ if (q->mq_ops) - blk_mq_complete_request(req, req->errors); + __blk_mq_complete_request(req); else __blk_complete_request(req); break; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b7638be58599..468be242db90 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -86,6 +86,8 @@ struct blk_mq_ops { */ rq_timed_out_fn *timeout; + softirq_done_fn *complete; + /* * Override for hctx allocations (should probably go) */ @@ -137,6 +139,8 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int); void blk_mq_end_io(struct request *rq, int error); +void blk_mq_complete_request(struct request *rq); + void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); -- cgit v1.2.3 From 18741986a4b1dc4b1f171634c4191abc3b0fa023 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Feb 2014 09:29:00 -0700 Subject: blk-mq: rework flush sequencing logic Witch to using a preallocated flush_rq for blk-mq similar to what's done with the old request path. This allows us to set up the request properly with a tag from the actually allowed range and ->rq_disk as needed by some drivers. To make life easier we also switch to dynamic allocation of ->flush_rq for the old path. This effectively reverts most of "blk-mq: fix for flush deadlock" and "blk-mq: Don't reserve a tag for flush request" Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 15 +++++-- block/blk-flush.c | 105 ++++++++++++++++++------------------------------- block/blk-mq.c | 54 +++++++++---------------- block/blk-mq.h | 1 + block/blk-sysfs.c | 2 + include/linux/blk-mq.h | 5 +-- include/linux/blkdev.h | 11 ++---- 7 files changed, 76 insertions(+), 117 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 06636f3ad424..853f92749202 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -693,11 +693,20 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) if (!uninit_q) return NULL; + uninit_q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL); + if (!uninit_q->flush_rq) + goto out_cleanup_queue; + q = blk_init_allocated_queue(uninit_q, rfn, lock); if (!q) - blk_cleanup_queue(uninit_q); - + goto out_free_flush_rq; return q; + +out_free_flush_rq: + kfree(uninit_q->flush_rq); +out_cleanup_queue: + blk_cleanup_queue(uninit_q); + return NULL; } EXPORT_SYMBOL(blk_init_queue_node); @@ -1127,7 +1136,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) { if (q->mq_ops) - return blk_mq_alloc_request(q, rw, gfp_mask, false); + return blk_mq_alloc_request(q, rw, gfp_mask); else return blk_old_get_request(q, rw, gfp_mask); } diff --git a/block/blk-flush.c b/block/blk-flush.c index 9143e85226c7..66e2b697f5db 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,20 +130,26 @@ static void blk_flush_restore_request(struct request *rq) blk_clear_rq_complete(rq); } -static void mq_flush_data_run(struct work_struct *work) +static void mq_flush_run(struct work_struct *work) { struct request *rq; - rq = container_of(work, struct request, mq_flush_data); + rq = container_of(work, struct request, mq_flush_work); memset(&rq->csd, 0, sizeof(rq->csd)); blk_mq_run_request(rq, true, false); } -static void blk_mq_flush_data_insert(struct request *rq) +static bool blk_flush_queue_rq(struct request *rq) { - INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); - kblockd_schedule_work(rq->q, &rq->mq_flush_data); + if (rq->q->mq_ops) { + INIT_WORK(&rq->mq_flush_work, mq_flush_run); + kblockd_schedule_work(rq->q, &rq->mq_flush_work); + return false; + } else { + list_add_tail(&rq->queuelist, &rq->q->queue_head); + return true; + } } /** @@ -187,12 +193,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &q->flush_data_in_flight); - if (q->mq_ops) - blk_mq_flush_data_insert(rq); - else { - list_add(&rq->queuelist, &q->queue_head); - queued = true; - } + queued = blk_flush_queue_rq(rq); break; case REQ_FSEQ_DONE: @@ -216,9 +217,6 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, } kicked = blk_kick_flush(q); - /* blk_mq_run_flush will run queue */ - if (q->mq_ops) - return queued; return kicked | queued; } @@ -230,10 +228,9 @@ static void flush_end_io(struct request *flush_rq, int error) struct request *rq, *n; unsigned long flags = 0; - if (q->mq_ops) { - blk_mq_free_request(flush_rq); + if (q->mq_ops) spin_lock_irqsave(&q->mq_flush_lock, flags); - } + running = &q->flush_queue[q->flush_running_idx]; BUG_ON(q->flush_pending_idx == q->flush_running_idx); @@ -263,48 +260,14 @@ static void flush_end_io(struct request *flush_rq, int error) * kblockd. */ if (queued || q->flush_queue_delayed) { - if (!q->mq_ops) - blk_run_queue_async(q); - else - /* - * This can be optimized to only run queues with requests - * queued if necessary. - */ - blk_mq_run_queues(q, true); + WARN_ON(q->mq_ops); + blk_run_queue_async(q); } q->flush_queue_delayed = 0; if (q->mq_ops) spin_unlock_irqrestore(&q->mq_flush_lock, flags); } -static void mq_flush_work(struct work_struct *work) -{ - struct request_queue *q; - struct request *rq; - - q = container_of(work, struct request_queue, mq_flush_work); - - rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, - __GFP_WAIT|GFP_ATOMIC, false); - rq->cmd_type = REQ_TYPE_FS; - rq->end_io = flush_end_io; - - blk_mq_run_request(rq, true, false); -} - -/* - * We can't directly use q->flush_rq, because it doesn't have tag and is not in - * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, - * so offload the work to workqueue. - * - * Note: we assume a flush request finished in any hardware queue will flush - * the whole disk cache. - */ -static void mq_run_flush(struct request_queue *q) -{ - kblockd_schedule_work(q, &q->mq_flush_work); -} - /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked @@ -339,19 +302,31 @@ static bool blk_kick_flush(struct request_queue *q) * different from running_idx, which means flush is in flight. */ q->flush_pending_idx ^= 1; + if (q->mq_ops) { - mq_run_flush(q); - return true; + struct blk_mq_ctx *ctx = first_rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); + + blk_mq_rq_init(hctx, q->flush_rq); + q->flush_rq->mq_ctx = ctx; + + /* + * Reuse the tag value from the fist waiting request, + * with blk-mq the tag is generated during request + * allocation and drivers can rely on it being inside + * the range they asked for. + */ + q->flush_rq->tag = first_rq->tag; + } else { + blk_rq_init(q, q->flush_rq); } - blk_rq_init(q, &q->flush_rq); - q->flush_rq.cmd_type = REQ_TYPE_FS; - q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; - q->flush_rq.rq_disk = first_rq->rq_disk; - q->flush_rq.end_io = flush_end_io; + q->flush_rq->cmd_type = REQ_TYPE_FS; + q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; + q->flush_rq->rq_disk = first_rq->rq_disk; + q->flush_rq->end_io = flush_end_io; - list_add_tail(&q->flush_rq.queuelist, &q->queue_head); - return true; + return blk_flush_queue_rq(q->flush_rq); } static void flush_data_end_io(struct request *rq, int error) @@ -407,11 +382,8 @@ void blk_insert_flush(struct request *rq) /* * @policy now records what operations need to be done. Adjust * REQ_FLUSH and FUA for the driver. - * We keep REQ_FLUSH for mq to track flush requests. For !FUA, - * we never dispatch the request directly. */ - if (rq->cmd_flags & REQ_FUA) - rq->cmd_flags &= ~REQ_FLUSH; + rq->cmd_flags &= ~REQ_FLUSH; if (!(fflags & REQ_FUA)) rq->cmd_flags &= ~REQ_FUA; @@ -560,5 +532,4 @@ EXPORT_SYMBOL(blkdev_issue_flush); void blk_mq_init_flush(struct request_queue *q) { spin_lock_init(&q->mq_flush_lock); - INIT_WORK(&q->mq_flush_work, mq_flush_work); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 14c8f35946e1..a59b0565e940 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -194,27 +194,9 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, } static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, - gfp_t gfp, bool reserved, - int rw) + gfp_t gfp, bool reserved) { - struct request *req; - bool is_flush = false; - /* - * flush need allocate a request, leave at least one request for - * non-flush IO to avoid deadlock - */ - if ((rw & REQ_FLUSH) && !(rw & REQ_FLUSH_SEQ)) { - if (atomic_inc_return(&hctx->pending_flush) >= - hctx->queue_depth - hctx->reserved_tags - 1) { - atomic_dec(&hctx->pending_flush); - return NULL; - } - is_flush = true; - } - req = blk_mq_alloc_rq(hctx, gfp, reserved); - if (!req && is_flush) - atomic_dec(&hctx->pending_flush); - return req; + return blk_mq_alloc_rq(hctx, gfp, reserved); } static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, @@ -227,7 +209,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); - rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved, rw); + rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); if (rq) { blk_mq_rq_ctx_init(q, ctx, rq, rw); break; @@ -244,15 +226,14 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, return rq; } -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, - gfp_t gfp, bool reserved) +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) { struct request *rq; if (blk_mq_queue_enter(q)) return NULL; - rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved); + rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); if (rq) blk_mq_put_ctx(rq->mq_ctx); return rq; @@ -276,7 +257,7 @@ EXPORT_SYMBOL(blk_mq_alloc_reserved_request); /* * Re-init and set pdu, if we have it */ -static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) +void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_rq_init(hctx->queue, rq); @@ -290,9 +271,6 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; - if ((rq->cmd_flags & REQ_FLUSH) && !(rq->cmd_flags & REQ_FLUSH_SEQ)) - atomic_dec(&hctx->pending_flush); - blk_mq_rq_init(hctx, rq); blk_mq_put_tag(hctx->tags, tag); @@ -946,14 +924,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) hctx = q->mq_ops->map_queue(q, ctx->cpu); trace_block_getrq(q, bio, rw); - rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false, bio->bi_rw); + rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); if (likely(rq)) - blk_mq_rq_ctx_init(q, ctx, rq, bio->bi_rw); + blk_mq_rq_ctx_init(q, ctx, rq, rw); else { blk_mq_put_ctx(ctx); trace_block_sleeprq(q, bio, rw); - rq = blk_mq_alloc_request_pinned(q, bio->bi_rw, - __GFP_WAIT|GFP_ATOMIC, false); + rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, + false); ctx = rq->mq_ctx; hctx = q->mq_ops->map_queue(q, ctx->cpu); } @@ -1230,9 +1208,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q, hctx->queue_num = i; hctx->flags = reg->flags; hctx->queue_depth = reg->queue_depth; - hctx->reserved_tags = reg->reserved_tags; hctx->cmd_size = reg->cmd_size; - atomic_set(&hctx->pending_flush, 0); blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1412,9 +1388,14 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, blk_mq_init_flush(q); blk_mq_init_cpu_queues(q, reg->nr_hw_queues); - if (blk_mq_init_hw_queues(q, reg, driver_data)) + q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size, + cache_line_size()), GFP_KERNEL); + if (!q->flush_rq) goto err_hw; + if (blk_mq_init_hw_queues(q, reg, driver_data)) + goto err_flush_rq; + blk_mq_map_swqueue(q); mutex_lock(&all_q_mutex); @@ -1422,6 +1403,9 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, mutex_unlock(&all_q_mutex); return q; + +err_flush_rq: + kfree(q->flush_rq); err_hw: kfree(q->mq_map); err_map: diff --git a/block/blk-mq.h b/block/blk-mq.h index f29b645f0e1c..ed0035cd458e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -28,6 +28,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_init_flush(struct request_queue *q); void blk_mq_drain_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); +void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq); /* * CPU hotplug helpers diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8095c4a21fc0..7500f876dae4 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -549,6 +549,8 @@ static void blk_release_queue(struct kobject *kobj) if (q->mq_ops) blk_mq_free_queue(q); + kfree(q->flush_rq); + blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 468be242db90..18ba8a627f46 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -36,15 +36,12 @@ struct blk_mq_hw_ctx { struct list_head page_list; struct blk_mq_tags *tags; - atomic_t pending_flush; - unsigned long queued; unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 10 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; unsigned int queue_depth; - unsigned int reserved_tags; unsigned int numa_node; unsigned int cmd_size; /* per-request extra data */ @@ -129,7 +126,7 @@ void blk_mq_insert_request(struct request_queue *, struct request *, void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_free_request(struct request *rq); bool blk_mq_can_queue(struct blk_mq_hw_ctx *); -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, bool reserved); +struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0375654adb28..b2d25ecbcbc1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -101,7 +101,7 @@ struct request { }; union { struct call_single_data csd; - struct work_struct mq_flush_data; + struct work_struct mq_flush_work; }; struct request_queue *q; @@ -451,13 +451,8 @@ struct request_queue { unsigned long flush_pending_since; struct list_head flush_queue[2]; struct list_head flush_data_in_flight; - union { - struct request flush_rq; - struct { - spinlock_t mq_flush_lock; - struct work_struct mq_flush_work; - }; - }; + struct request *flush_rq; + spinlock_t mq_flush_lock; struct mutex sysfs_lock; -- cgit v1.2.3 From fb37bb04d6c8d6c44e61d9da189dcfa6aa0f135e Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 10 Feb 2014 14:25:49 -0800 Subject: smp.h: fix x86+cpu.c sparse warnings about arch nonboot CPU calls Use what we already do for arch_disable_smp_support() to fix these: arch/x86/kernel/smpboot.c:1155:6: warning: symbol 'arch_enable_nonboot_cpus_begin' was not declared. Should it be static? arch/x86/kernel/smpboot.c:1160:6: warning: symbol 'arch_enable_nonboot_cpus_end' was not declared. Should it be static? kernel/cpu.c:512:13: warning: symbol 'arch_enable_nonboot_cpus_begin' was not declared. Should it be static? kernel/cpu.c:516:13: warning: symbol 'arch_enable_nonboot_cpus_end' was not declared. Should it be static? Signed-off-by: Paul Gortmaker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 3834f43f9993..6ae004e437ea 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -188,6 +188,9 @@ static inline void kick_all_cpus_sync(void) { } */ extern void arch_disable_smp_support(void); +extern void arch_enable_nonboot_cpus_begin(void); +extern void arch_enable_nonboot_cpus_end(void); + void smp_setup_processor_id(void); #endif /* __LINUX_SMP_H */ -- cgit v1.2.3 From 0ab02ca8f887908152d1a96db5130fc661d36a1e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 11 Feb 2014 16:05:46 +0800 Subject: cgroup: protect modifications to cgroup_idr with cgroup_mutex Setup cgroupfs like this: # mount -t cgroup -o cpuacct xxx /cgroup # mkdir /cgroup/sub1 # mkdir /cgroup/sub2 Then run these two commands: # for ((; ;)) { mkdir /cgroup/sub1/tmp && rmdir /mnt/sub1/tmp; } & # for ((; ;)) { mkdir /cgroup/sub2/tmp && rmdir /mnt/sub2/tmp; } & After seconds you may see this warning: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 25243 at lib/idr.c:527 sub_remove+0x87/0x1b0() idr_remove called for id=6 which is not allocated. ... Call Trace: [] dump_stack+0x7a/0x96 [] warn_slowpath_common+0x8c/0xc0 [] warn_slowpath_fmt+0x46/0x50 [] sub_remove+0x87/0x1b0 [] ? css_killed_work_fn+0x32/0x1b0 [] idr_remove+0x25/0xd0 [] cgroup_destroy_css_killed+0x5b/0xc0 [] css_killed_work_fn+0x130/0x1b0 [] process_one_work+0x26c/0x550 [] worker_thread+0x12e/0x3b0 [] kthread+0xe6/0xf0 [] ret_from_fork+0x7c/0xb0 ---[ end trace 2d1577ec10cf80d0 ]--- It's because allocating/removing cgroup ID is not properly synchronized. The bug was introduced when we converted cgroup_ida to cgroup_idr. While synchronization is already done inside ida_simple_{get,remove}(), users are responsible for concurrent calls to idr_{alloc,remove}(). tj: Refreshed on top of b58c89986a77 ("cgroup: fix error return from cgroup_create()"). Fixes: 4e96ee8e981b ("cgroup: convert cgroup_ida to cgroup_idr") Cc: #3.12+ Reported-by: Michal Hocko Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 34 ++++++++++++++++++---------------- 2 files changed, 20 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5c097596104b..9450f025fe0c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -166,6 +166,8 @@ struct cgroup { * * The ID of the root cgroup is always 0, and a new cgroup * will be assigned with a smallest available ID. + * + * Allocating/Removing ID must be protected by cgroup_mutex. */ int id; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3edf7163b84f..52719ce55dd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * per-subsystem and moved to css->id so that lookups are * successful until the target css is released. */ + mutex_lock(&cgroup_mutex); idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + mutex_unlock(&cgroup_mutex); cgrp->id = -1; call_rcu(&cgrp->rcu_head, cgroup_free_rcu); @@ -4167,16 +4169,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } rcu_assign_pointer(cgrp->name, name); - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. - */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) { - err = -ENOMEM; - goto err_free_name; - } - /* * Only live parents can have children. Note that the liveliness * check isn't strictly necessary because cgroup_mkdir() and @@ -4186,7 +4178,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_id; + goto err_free_name; + } + + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. + */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + if (cgrp->id < 0) { + err = -ENOMEM; + goto err_unlock; } /* Grab a reference on the superblock so the hierarchy doesn't @@ -4218,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_unlock; + goto err_free_id; lockdep_assert_held(&dentry->d_inode->i_mutex); cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4254,12 +4256,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return 0; -err_unlock: - mutex_unlock(&cgroup_mutex); - /* Release the reference count that we took on the superblock */ - deactivate_super(sb); err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); + /* Release the reference count that we took on the superblock */ + deactivate_super(sb); +err_unlock: + mutex_unlock(&cgroup_mutex); err_free_name: kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: -- cgit v1.2.3 From 8423ae3d7a3cfe084865262cfaeba1359d405182 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2014 17:45:50 -0800 Subject: block: Fix cloning of discard/write same bios Immutable biovecs changed the way bio segments are treated in such a way that bio_for_each_segment() cannot now do what we want for discard/write same bios, since bi_size means something completely different for them. Fortunately discard and write same bios never have more than a single biovec, so bio_for_each_segment() is unnecessary and not terribly meaningful for them, but we still have to special case them in a few places. Signed-off-by: Kent Overstreet Tested-by: Richard W.M. Jones Signed-off-by: Jens Axboe --- fs/bio.c | 15 ++++++++++----- include/linux/bio.h | 11 +++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/bio.c b/fs/bio.c index 75c49a382239..8754e7b6eb49 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -611,7 +611,6 @@ EXPORT_SYMBOL(bio_clone_fast); struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, struct bio_set *bs) { - unsigned nr_iovecs = 0; struct bvec_iter iter; struct bio_vec bv; struct bio *bio; @@ -638,10 +637,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, * __bio_clone_fast() anyways. */ - bio_for_each_segment(bv, bio_src, iter) - nr_iovecs++; - - bio = bio_alloc_bioset(gfp_mask, nr_iovecs, bs); + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; @@ -650,9 +646,18 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + if (bio->bi_rw & REQ_DISCARD) + goto integrity_clone; + + if (bio->bi_rw & REQ_WRITE_SAME) { + bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; + goto integrity_clone; + } + bio_for_each_segment(bv, bio_src, iter) bio->bi_io_vec[bio->bi_vcnt++] = bv; +integrity_clone: if (bio_integrity(bio_src)) { int ret; diff --git a/include/linux/bio.h b/include/linux/bio.h index d6791bba8264..5a4d39b4686b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -250,6 +250,17 @@ static inline unsigned bio_segments(struct bio *bio) struct bio_vec bv; struct bvec_iter iter; + /* + * We special case discard/write same, because they interpret bi_size + * differently: + */ + + if (bio->bi_rw & REQ_DISCARD) + return 1; + + if (bio->bi_rw & REQ_WRITE_SAME) + return 1; + bio_for_each_segment(bv, bio, iter) segs++; -- cgit v1.2.3 From c0b00a525c127d0055c1df6283300e17f601a1a1 Mon Sep 17 00:00:00 2001 From: Sumit Semwal Date: Mon, 3 Feb 2014 15:09:12 +0530 Subject: dma-buf: update debugfs output Russell King observed 'wierd' looking output from debugfs, and also suggested better ways of getting device names (use KBUILD_MODNAME, dev_name()) This patch addresses these issues to make the debugfs output correct and better looking. While at it, replace seq_printf with seq_puts to remove the checkpatch.pl warnings. Reported-by: Russell King - ARM Linux Signed-off-by: Sumit Semwal --- drivers/base/dma-buf.c | 25 ++++++++++++------------- include/linux/dma-buf.h | 2 +- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/dma-buf.c b/drivers/base/dma-buf.c index 1e16cbd61da2..61d6d62cc0d3 100644 --- a/drivers/base/dma-buf.c +++ b/drivers/base/dma-buf.c @@ -616,36 +616,35 @@ static int dma_buf_describe(struct seq_file *s) if (ret) return ret; - seq_printf(s, "\nDma-buf Objects:\n"); - seq_printf(s, "\texp_name\tsize\tflags\tmode\tcount\n"); + seq_puts(s, "\nDma-buf Objects:\n"); + seq_puts(s, "size\tflags\tmode\tcount\texp_name\n"); list_for_each_entry(buf_obj, &db_list.head, list_node) { ret = mutex_lock_interruptible(&buf_obj->lock); if (ret) { - seq_printf(s, - "\tERROR locking buffer object: skipping\n"); + seq_puts(s, + "\tERROR locking buffer object: skipping\n"); continue; } - seq_printf(s, "\t"); - - seq_printf(s, "\t%s\t%08zu\t%08x\t%08x\t%08ld\n", - buf_obj->exp_name, buf_obj->size, + seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\n", + buf_obj->size, buf_obj->file->f_flags, buf_obj->file->f_mode, - (long)(buf_obj->file->f_count.counter)); + (long)(buf_obj->file->f_count.counter), + buf_obj->exp_name); - seq_printf(s, "\t\tAttached Devices:\n"); + seq_puts(s, "\tAttached Devices:\n"); attach_count = 0; list_for_each_entry(attach_obj, &buf_obj->attachments, node) { - seq_printf(s, "\t\t"); + seq_puts(s, "\t"); - seq_printf(s, "%s\n", attach_obj->dev->init_name); + seq_printf(s, "%s\n", dev_name(attach_obj->dev)); attach_count++; } - seq_printf(s, "\n\t\tTotal %d devices attached\n", + seq_printf(s, "Total %d devices attached\n\n", attach_count); count++; diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index dfac5ed31120..f886985a28b2 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -171,7 +171,7 @@ struct dma_buf *dma_buf_export_named(void *priv, const struct dma_buf_ops *ops, size_t size, int flags, const char *); #define dma_buf_export(priv, ops, size, flags) \ - dma_buf_export_named(priv, ops, size, flags, __FILE__) + dma_buf_export_named(priv, ops, size, flags, KBUILD_MODNAME) int dma_buf_fd(struct dma_buf *dmabuf, int flags); struct dma_buf *dma_buf_get(int fd); -- cgit v1.2.3 From a9f180345f5378ac87d80ed0bea55ba421d83859 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Wed, 12 Feb 2014 23:01:07 -0800 Subject: compiler/gcc4: Make quirk for asm_volatile_goto() unconditional I started noticing problems with KVM guest destruction on Linux 3.12+, where guest memory wasn't being cleaned up. I bisected it down to the commit introducing the new 'asm goto'-based atomics, and found this quirk was later applied to those. Unfortunately, even with GCC 4.8.2 (which ostensibly fixed the known 'asm goto' bug) I am still getting some kind of miscompilation. If I enable the asm_volatile_goto quirk for my compiler, KVM guests are destroyed correctly and the memory is cleaned up. So make the quirk unconditional for now, until bug is found and fixed. Suggested-by: Linus Torvalds Signed-off-by: Steven Noonan Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Jakub Jelinek Cc: Richard Henderson Cc: Andrew Morton Cc: Oleg Nesterov Cc: Link: http://lkml.kernel.org/r/1392274867-15236-1-git-send-email-steven@uplinklabs.net Link: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 Signed-off-by: Ingo Molnar --- include/linux/compiler-gcc4.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index ded429966c1f..2507fd2a1eb4 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -75,11 +75,7 @@ * * (asm goto is automatically volatile - the naming reflects this.) */ -#if GCC_VERSION <= 40801 -# define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) -#else -# define asm_volatile_goto(x...) do { asm goto(x); } while (0) -#endif +#define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0) #ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP #if GCC_VERSION >= 40400 -- cgit v1.2.3 From 3ce4e860e578f843db36a1f7357ba00aeaa7610f Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 13 Feb 2014 10:48:02 -0700 Subject: PCI/MSI: Add pci_enable_msi_exact() and pci_enable_msix_exact() The new functions are special cases for pci_enable_msi_range() and pci_enable_msix_range() when a particular number of MSI or MSI-X is needed. By contrast with pci_enable_msi_range() and pci_enable_msix_range() functions, pci_enable_msi_exact() and pci_enable_msix_exact() return zero in case of success, which indicates MSI or MSI-X interrupts have been successfully allocated. Signed-off-by: Alexander Gordeev Signed-off-by: Bjorn Helgaas --- Documentation/PCI/MSI-HOWTO.txt | 86 ++++++++++++++++++++++++++++++++++++++++- include/linux/pci.h | 20 ++++++++++ 2 files changed, 104 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt index 96ee5eb9d4d4..10a93696e55a 100644 --- a/Documentation/PCI/MSI-HOWTO.txt +++ b/Documentation/PCI/MSI-HOWTO.txt @@ -159,6 +159,11 @@ static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec) return pci_enable_msi_range(pdev, nvec, nvec); } +Note, unlike pci_enable_msi_exact() function, which could be also used to +enable a particular number of MSI-X interrupts, pci_enable_msi_range() +returns either a negative errno or 'nvec' (not negative errno or 0 - as +pci_enable_msi_exact() does). + 4.2.1.3 Single MSI mode The most notorious example of the request type described above is @@ -175,7 +180,22 @@ enable the single MSI mode, pci_enable_msi_range() returns either a negative errno or 1 (not negative errno or 0 - as pci_enable_msi() does). -4.2.3 pci_disable_msi +4.2.3 pci_enable_msi_exact + +int pci_enable_msi_exact(struct pci_dev *dev, int nvec) + +This variation on pci_enable_msi_range() call allows a device driver to +request exactly 'nvec' MSIs. + +If this function returns a negative number, it indicates an error and +the driver should not attempt to request any more MSI interrupts for +this device. + +By contrast with pci_enable_msi_range() function, pci_enable_msi_exact() +returns zero in case of success, which indicates MSI interrupts have been +successfully allocated. + +4.2.4 pci_disable_msi void pci_disable_msi(struct pci_dev *dev) @@ -303,6 +323,11 @@ static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec) nvec, nvec); } +Note, unlike pci_enable_msix_exact() function, which could be also used to +enable a particular number of MSI-X interrupts, pci_enable_msix_range() +returns either a negative errno or 'nvec' (not negative errno or 0 - as +pci_enable_msix_exact() does). + 4.3.1.3 Specific requirements to the number of MSI-X interrupts As noted above, there could be devices that can not operate with just any @@ -349,7 +374,64 @@ Note how pci_enable_msix_range() return value is analized for a fallback - any error code other than -ENOSPC indicates a fatal error and should not be retried. -4.3.2 pci_disable_msix +4.3.2 pci_enable_msix_exact + +int pci_enable_msix_exact(struct pci_dev *dev, + struct msix_entry *entries, int nvec) + +This variation on pci_enable_msix_range() call allows a device driver to +request exactly 'nvec' MSI-Xs. + +If this function returns a negative number, it indicates an error and +the driver should not attempt to allocate any more MSI-X interrupts for +this device. + +By contrast with pci_enable_msix_range() function, pci_enable_msix_exact() +returns zero in case of success, which indicates MSI-X interrupts have been +successfully allocated. + +Another version of a routine that enables MSI-X mode for a device with +specific requirements described in chapter 4.3.1.3 might look like this: + +/* + * Assume 'minvec' and 'maxvec' are non-zero + */ +static int foo_driver_enable_msix(struct foo_adapter *adapter, + int minvec, int maxvec) +{ + int rc; + + minvec = roundup_pow_of_two(minvec); + maxvec = rounddown_pow_of_two(maxvec); + + if (minvec > maxvec) + return -ERANGE; + +retry: + rc = pci_enable_msix_exact(adapter->pdev, + adapter->msix_entries, maxvec); + + /* + * -ENOSPC is the only error code allowed to be analyzed + */ + if (rc == -ENOSPC) { + if (maxvec == 1) + return -ENOSPC; + + maxvec /= 2; + + if (minvec > maxvec) + return -ENOSPC; + + goto retry; + } else if (rc < 0) { + return rc; + } + + return maxvec; +} + +4.3.3 pci_disable_msix void pci_disable_msix(struct pci_dev *dev) diff --git a/include/linux/pci.h b/include/linux/pci.h index fb57c892b214..33aa2caf0f0c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1169,8 +1169,23 @@ void msi_remove_pci_irq_vectors(struct pci_dev *dev); void pci_restore_msi_state(struct pci_dev *dev); int pci_msi_enabled(void); int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec); +static inline int pci_enable_msi_exact(struct pci_dev *dev, int nvec) +{ + int rc = pci_enable_msi_range(dev, nvec, nvec); + if (rc < 0) + return rc; + return 0; +} int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, int maxvec); +static inline int pci_enable_msix_exact(struct pci_dev *dev, + struct msix_entry *entries, int nvec) +{ + int rc = pci_enable_msix_range(dev, entries, nvec, nvec); + if (rc < 0) + return rc; + return 0; +} #else static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } static inline int pci_enable_msi_block(struct pci_dev *dev, int nvec) @@ -1189,9 +1204,14 @@ static inline int pci_msi_enabled(void) { return 0; } static inline int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec) { return -ENOSYS; } +static inline int pci_enable_msi_exact(struct pci_dev *dev, int nvec) +{ return -ENOSYS; } static inline int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, int maxvec) { return -ENOSYS; } +static inline int pci_enable_msix_exact(struct pci_dev *dev, + struct msix_entry *entries, int nvec) +{ return -ENOSYS; } #endif #ifdef CONFIG_PCIEPORTBUS -- cgit v1.2.3 From d206940319c41df4299db75ed56142177bb2e5f6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Feb 2014 23:09:11 +0100 Subject: net: core: introduce netif_skb_dev_features Will be used by upcoming ipv4 forward path change that needs to determine feature mask using skb->dst->dev instead of skb->dev. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 ++++++- net/core/dev.c | 22 ++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 440a02ee6f92..21d4e6be8949 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3068,7 +3068,12 @@ void netdev_change_features(struct net_device *dev); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); -netdev_features_t netif_skb_features(struct sk_buff *skb); +netdev_features_t netif_skb_dev_features(struct sk_buff *skb, + const struct net_device *dev); +static inline netdev_features_t netif_skb_features(struct sk_buff *skb) +{ + return netif_skb_dev_features(skb, skb->dev); +} static inline bool net_gso_ok(netdev_features_t features, int gso_type) { diff --git a/net/core/dev.c b/net/core/dev.c index 4ad1b78c9c77..b1b0c8d4d7df 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2420,7 +2420,7 @@ EXPORT_SYMBOL(netdev_rx_csum_fault); * 2. No high memory really exists on this machine. */ -static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_HIGHMEM int i; @@ -2495,34 +2495,36 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) } static netdev_features_t harmonize_features(struct sk_buff *skb, - netdev_features_t features) + const struct net_device *dev, + netdev_features_t features) { if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, skb_network_protocol(skb))) { features &= ~NETIF_F_ALL_CSUM; - } else if (illegal_highdma(skb->dev, skb)) { + } else if (illegal_highdma(dev, skb)) { features &= ~NETIF_F_SG; } return features; } -netdev_features_t netif_skb_features(struct sk_buff *skb) +netdev_features_t netif_skb_dev_features(struct sk_buff *skb, + const struct net_device *dev) { __be16 protocol = skb->protocol; - netdev_features_t features = skb->dev->features; + netdev_features_t features = dev->features; - if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs) features &= ~NETIF_F_GSO_MASK; if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } else if (!vlan_tx_tag_present(skb)) { - return harmonize_features(skb, features); + return harmonize_features(skb, dev, features); } - features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | + features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) @@ -2530,9 +2532,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - return harmonize_features(skb, features); + return harmonize_features(skb, dev, features); } -EXPORT_SYMBOL(netif_skb_features); +EXPORT_SYMBOL(netif_skb_dev_features); int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) -- cgit v1.2.3 From fe6cc55f3a9a053482a76f5a6b2257cee51b4663 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Feb 2014 23:09:12 +0100 Subject: net: ip, ipv6: handle gso skbs in forwarding path Marcelo Ricardo Leitner reported problems when the forwarding link path has a lower mtu than the incoming one if the inbound interface supports GRO. Given: Host R1 R2 Host sends tcp stream which is routed via R1 and R2. R1 performs GRO. In this case, the kernel will fail to send ICMP fragmentation needed messages (or pkt too big for ipv6), as GSO packets currently bypass dstmtu checks in forward path. Instead, Linux tries to send out packets exceeding the mtu. When locking route MTU on Host (i.e., no ipv4 DF bit set), R1 does not fragment the packets when forwarding, and again tries to send out packets exceeding R1-R2 link mtu. This alters the forwarding dstmtu checks to take the individual gso segment lengths into account. For ipv6, we send out pkt too big error for gso if the individual segments are too big. For ipv4, we either send icmp fragmentation needed, or, if the DF bit is not set, perform software segmentation and let the output path create fragments when the packet is leaving the machine. It is not 100% correct as the error message will contain the headers of the GRO skb instead of the original/segmented one, but it seems to work fine in my (limited) tests. Eric Dumazet suggested to simply shrink mss via ->gso_size to avoid sofware segmentation. However it turns out that skb_segment() assumes skb nr_frags is related to mss size so we would BUG there. I don't want to mess with it considering Herbert and Eric disagree on what the correct behavior should be. Hannes Frederic Sowa notes that when we would shrink gso_size skb_segment would then also need to deal with the case where SKB_MAX_FRAGS would be exceeded. This uses sofware segmentation in the forward path when we hit ipv4 non-DF packets and the outgoing link mtu is too small. Its not perfect, but given the lack of bug reports wrt. GRO fwd being broken this is a rare case anyway. Also its not like this could not be improved later once the dust settles. Acked-by: Herbert Xu Reported-by: Marcelo Ricardo Leitner Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 17 ++++++++++++ net/ipv4/ip_forward.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++-- net/ipv6/ip6_output.c | 17 ++++++++++-- 3 files changed, 101 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f589c9af8cbf..3ebbbe7b6d05 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2916,5 +2916,22 @@ static inline bool skb_head_is_locked(const struct sk_buff *skb) { return !skb->head_frag || skb_cloned(skb); } + +/** + * skb_gso_network_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_network_seglen is used to determine the real size of the + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). + * + * The MAC/L2 header is not accounted for. + */ +static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - + skb_network_header(skb); + return hdr_len + skb_gso_transport_seglen(skb); +} #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index e9f1217a8afd..f3869c186d97 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,6 +39,71 @@ #include #include +static bool ip_may_fragment(const struct sk_buff *skb) +{ + return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || + !skb->local_df; +} + +static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu || skb->local_df) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + +static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb) +{ + unsigned int mtu; + + if (skb->local_df || !skb_is_gso(skb)) + return false; + + mtu = ip_dst_mtu_maybe_forward(skb_dst(skb), true); + + /* if seglen > mtu, do software segmentation for IP fragmentation on + * output. DF bit cannot be set since ip_forward would have sent + * icmp error. + */ + return skb_gso_network_seglen(skb) > mtu; +} + +/* called if GSO skb needs to be fragmented on forward */ +static int ip_forward_finish_gso(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + netdev_features_t features; + struct sk_buff *segs; + int ret = 0; + + features = netif_skb_dev_features(skb, dst->dev); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = dst_output(segs); + + if (err && ret == 0) + ret = err; + segs = nskb; + } while (segs); + + return ret; +} + static int ip_forward_finish(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); @@ -49,6 +114,9 @@ static int ip_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); + if (ip_gso_exceeds_dst_mtu(skb)) + return ip_forward_finish_gso(skb); + return dst_output(skb); } @@ -91,8 +159,7 @@ int ip_forward(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (unlikely(skb->len > mtu && !skb_is_gso(skb) && - (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { + if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ef02b26ccf81..070a2fae2375 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -342,6 +342,20 @@ static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) return mtu; } +static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu || skb->local_df) + return false; + + if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) + return true; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + int ip6_forward(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -466,8 +480,7 @@ int ip6_forward(struct sk_buff *skb) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) || - (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) { + if (ip6_pkt_too_big(skb, mtu)) { /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); -- cgit v1.2.3 From 6ecde51dd7894ffe2f959cca1fea3ea2b9ee2394 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 13 Feb 2014 20:45:17 -0800 Subject: mlx5: Add include of because of kzalloc()/kfree() use On some architectures (for example, arm), we don't end up indirectly pulling in the declaration of kzalloc() and kfree(), and so building anything that includes breaks. Fix this by adding an explicit include to get the declaration. Reported-by: kbuild test robot Signed-off-by: Roland Dreier --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 32cb18c399c2..130bc8d77fa5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -38,8 +38,10 @@ #include #include #include +#include #include #include + #include #include -- cgit v1.2.3 From fada94ee64e6e18793b1db60fb8278d2eddbf922 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 14 Feb 2014 10:52:57 +0800 Subject: workqueue: add args to workqueue lockdep name Tommi noticed a 'funny' lock class name: "%s#5" from a lock acquired in process_one_work(). Maybe #fmt plus #args could be used as the lock_name to give some more information for some fmt string like the above. __builtin_constant_p() check is removed (as there seems no good way to check all the variables in args list). However, by removing the check, it only adds two additional "s for those constants. Some lockdep name examples printed out after the change: lockdep name wq->name "events_long" events_long "%s"("khelper") khelper "xfs-data/%s"mp->m_fsname xfs-data/dm-3 Signed-off-by: Li Zhong Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 594521ba0d43..704f4f652d0a 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -419,10 +419,7 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, static struct lock_class_key __key; \ const char *__lock_name; \ \ - if (__builtin_constant_p(fmt)) \ - __lock_name = (fmt); \ - else \ - __lock_name = #fmt; \ + __lock_name = #fmt#args; \ \ __alloc_workqueue_key((fmt), (flags), (max_active), \ &__key, __lock_name, ##args); \ -- cgit v1.2.3 From 99932d4fc03a13bb3e94938fe25458fabc8f2fc3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 16 Feb 2014 15:55:20 +0100 Subject: netdevice: add queue selection fallback handler for ndo_select_queue Add a new argument for ndo_select_queue() callback that passes a fallback handler. This gets invoked through netdev_pick_tx(); fallback handler is currently __netdev_pick_tx() as most drivers invoke this function within their customized implementation in case for skbs that don't need any special handling. This fallback handler can then be replaced on other call-sites with different queue selection methods (e.g. in packet sockets, pktgen etc). This also has the nice side-effect that __netdev_pick_tx() is then only invoked from netdev_pick_tx() and export of that function to modules can be undone. Suggested-by: David S. Miller Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 4 ++-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 6 +++--- drivers/net/ethernet/lantiq_etop.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 4 ++-- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h | 2 +- drivers/net/ethernet/tile/tilegx.c | 2 +- drivers/net/team/team.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/wireless/mwifiex/main.c | 2 +- drivers/staging/bcm/Bcmnet.c | 2 +- drivers/staging/netlogic/xlr_net.c | 2 +- drivers/staging/rtl8188eu/os_dep/os_intfs.c | 2 +- include/linux/netdevice.h | 9 ++++++--- net/core/flow_dissector.c | 7 +++---- net/mac80211/iface.c | 6 ++++-- 17 files changed, 31 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 867664918715..1c6104d3501d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3707,7 +3707,7 @@ static inline int bond_slave_override(struct bonding *bond, static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { /* * This helper function exists to help dev_pick_tx get the correct diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 9d7419e0390b..66c0df78c3ff 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -1873,7 +1873,7 @@ void bnx2x_netif_stop(struct bnx2x *bp, int disable_hw) } u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct bnx2x *bp = netdev_priv(dev); @@ -1895,7 +1895,7 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, } /* select a non-FCoE queue */ - return __netdev_pick_tx(dev, skb) % BNX2X_NUM_ETH_QUEUES(bp); + return fallback(dev, skb) % BNX2X_NUM_ETH_QUEUES(bp); } void bnx2x_set_num_queues(struct bnx2x *bp) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index bfc58d488bb5..a89a40f88c25 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -496,7 +496,7 @@ int bnx2x_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos); /* select_queue callback */ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + void *accel_priv, select_queue_fallback_t fallback); static inline void bnx2x_update_rx_prod(struct bnx2x *bp, struct bnx2x_fastpath *fp, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 6d4ada72dfd0..18076c4178b4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6881,7 +6881,7 @@ static inline int ixgbe_maybe_stop_tx(struct ixgbe_ring *tx_ring, u16 size) } static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct ixgbe_fwd_adapter *fwd_adapter = accel_priv; #ifdef IXGBE_FCOE @@ -6907,7 +6907,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, if (adapter->flags & IXGBE_FLAG_FCOE_ENABLED) break; default: - return __netdev_pick_tx(dev, skb); + return fallback(dev, skb); } f = &adapter->ring_feature[RING_F_FCOE]; @@ -6920,7 +6920,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb, return txq + f->offset; #else - return __netdev_pick_tx(dev, skb); + return fallback(dev, skb); #endif } diff --git a/drivers/net/ethernet/lantiq_etop.c b/drivers/net/ethernet/lantiq_etop.c index 8f9266c64c75..fd4b6aecf6ee 100644 --- a/drivers/net/ethernet/lantiq_etop.c +++ b/drivers/net/ethernet/lantiq_etop.c @@ -619,7 +619,7 @@ ltq_etop_set_multicast_list(struct net_device *dev) static u16 ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { /* we are currently only using the first queue */ return 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 8e8a7eb43a2c..13457032d15f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -629,7 +629,7 @@ static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct sk_buff *sk } u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct mlx4_en_priv *priv = netdev_priv(dev); u16 rings_p_up = priv->num_tx_rings_p_up; @@ -641,7 +641,7 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, if (vlan_tx_tag_present(skb)) up = vlan_tx_tag_get(skb) >> VLAN_PRIO_SHIFT; - return __netdev_pick_tx(dev, skb) % rings_p_up + up * rings_p_up; + return fallback(dev, skb) % rings_p_up + up * rings_p_up; } static void mlx4_bf_copy(void __iomem *dst, unsigned long *src, unsigned bytecnt) diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index 3af04c3f42ea..9ca223bc90fc 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -723,7 +723,7 @@ int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); void mlx4_en_tx_irq(struct mlx4_cq *mcq); u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + void *accel_priv, select_queue_fallback_t fallback); netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c index 023237a65720..17503da9f7a5 100644 --- a/drivers/net/ethernet/tile/tilegx.c +++ b/drivers/net/ethernet/tile/tilegx.c @@ -2071,7 +2071,7 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev) /* Return subqueue id on this core (one per core). */ static u16 tile_net_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { return smp_processor_id(); } diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 28407426fd6f..c8624a8235ab 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1648,7 +1648,7 @@ static netdev_tx_t team_xmit(struct sk_buff *skb, struct net_device *dev) } static u16 team_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { /* * This helper function exists to help dev_pick_tx get the correct diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 44c4db8450f0..8fe9cb7d0f72 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -366,7 +366,7 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) * hope the rxq no. may help here. */ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct tun_struct *tun = netdev_priv(dev); struct tun_flow_entry *e; diff --git a/drivers/net/wireless/mwifiex/main.c b/drivers/net/wireless/mwifiex/main.c index 4d79761b9c87..9d3d2758ec35 100644 --- a/drivers/net/wireless/mwifiex/main.c +++ b/drivers/net/wireless/mwifiex/main.c @@ -748,7 +748,7 @@ static struct net_device_stats *mwifiex_get_stats(struct net_device *dev) static u16 mwifiex_netdev_select_wmm_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { skb->priority = cfg80211_classify8021d(skb, NULL); return mwifiex_1d_to_wmm_queue[skb->priority]; diff --git a/drivers/staging/bcm/Bcmnet.c b/drivers/staging/bcm/Bcmnet.c index 8dfdd2732bdc..95a2358267ba 100644 --- a/drivers/staging/bcm/Bcmnet.c +++ b/drivers/staging/bcm/Bcmnet.c @@ -40,7 +40,7 @@ static INT bcm_close(struct net_device *dev) } static u16 bcm_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { return ClassifyPacket(netdev_priv(dev), skb); } diff --git a/drivers/staging/netlogic/xlr_net.c b/drivers/staging/netlogic/xlr_net.c index eedffed17e39..6f9ac27730af 100644 --- a/drivers/staging/netlogic/xlr_net.c +++ b/drivers/staging/netlogic/xlr_net.c @@ -307,7 +307,7 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb, } static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { return (u16)smp_processor_id(); } diff --git a/drivers/staging/rtl8188eu/os_dep/os_intfs.c b/drivers/staging/rtl8188eu/os_dep/os_intfs.c index 68f98fa114d2..7c9ee58f47bb 100644 --- a/drivers/staging/rtl8188eu/os_dep/os_intfs.c +++ b/drivers/staging/rtl8188eu/os_dep/os_intfs.c @@ -653,7 +653,7 @@ static unsigned int rtw_classify8021d(struct sk_buff *skb) } static u16 rtw_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, select_queue_fallback_t fallback) { struct adapter *padapter = rtw_netdev_priv(dev); struct mlme_priv *pmlmepriv = &padapter->mlmepriv; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 21d4e6be8949..1de9c136b066 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -752,6 +752,9 @@ struct netdev_phys_port_id { unsigned char id_len; }; +typedef u16 (*select_queue_fallback_t)(struct net_device *dev, + struct sk_buff *skb); + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -783,7 +786,7 @@ struct netdev_phys_port_id { * Required can not be NULL. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - * void *accel_priv); + * void *accel_priv, select_queue_fallback_t fallback); * Called to decide which queue to when device supports multiple * transmit queues. * @@ -1005,7 +1008,8 @@ struct net_device_ops { struct net_device *dev); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - void *accel_priv); + void *accel_priv, + select_queue_fallback_t fallback); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); @@ -1551,7 +1555,6 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, void *accel_priv); -u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb); /* * Net namespace inlines diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 87577d447554..75fe83f590ea 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -372,7 +372,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) #endif } -u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) { struct sock *sk = skb->sk; int queue_index = sk_tx_queue_get(sk); @@ -392,7 +392,6 @@ u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) return queue_index; } -EXPORT_SYMBOL(__netdev_pick_tx); struct netdev_queue *netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, @@ -403,8 +402,8 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb, - accel_priv); + queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + __netdev_pick_tx); else queue_index = __netdev_pick_tx(dev, skb); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index d6d1f1df9119..ce1c44370610 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1057,7 +1057,8 @@ static void ieee80211_uninit(struct net_device *dev) static u16 ieee80211_netdev_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, + select_queue_fallback_t fallback) { return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); } @@ -1075,7 +1076,8 @@ static const struct net_device_ops ieee80211_dataif_ops = { static u16 ieee80211_monitor_select_queue(struct net_device *dev, struct sk_buff *skb, - void *accel_priv) + void *accel_priv, + select_queue_fallback_t fallback) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; -- cgit v1.2.3 From b9507bdaf40e91fea2b1c0c1ee7dc627c8ee6fd6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 16 Feb 2014 15:55:21 +0100 Subject: netdevice: move netdev_cap_txqueue for shared usage to header In order to allow users to invoke netdev_cap_txqueue, it needs to be moved into netdevice.h header file. While at it, also add kernel doc header to document the API. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 20 ++++++++++++++++++++ net/core/flow_dissector.c | 13 +------------ 2 files changed, 21 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1de9c136b066..e8eeebd49a98 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2278,6 +2278,26 @@ static inline void netdev_reset_queue(struct net_device *dev_queue) netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0)); } +/** + * netdev_cap_txqueue - check if selected tx queue exceeds device queues + * @dev: network device + * @queue_index: given tx queue index + * + * Returns 0 if given tx queue index >= number of device tx queues, + * otherwise returns the originally passed tx queue index. + */ +static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index) +{ + if (unlikely(queue_index >= dev->real_num_tx_queues)) { + net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", + dev->name, queue_index, + dev->real_num_tx_queues); + return 0; + } + + return queue_index; +} + /** * netif_running - test if up * @dev: network device diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 75fe83f590ea..e29e810663d7 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -323,17 +323,6 @@ u32 __skb_get_poff(const struct sk_buff *skb) return poff; } -static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) -{ - if (unlikely(queue_index >= dev->real_num_tx_queues)) { - net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", - dev->name, queue_index, - dev->real_num_tx_queues); - return 0; - } - return queue_index; -} - static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_XPS @@ -408,7 +397,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, queue_index = __netdev_pick_tx(dev, skb); if (!accel_priv) - queue_index = dev_cap_txqueue(dev, queue_index); + queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); -- cgit v1.2.3 From bcdfeb2eb4e42b811950b9cd226109291051732a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 11 Feb 2014 13:04:19 +0800 Subject: ceph: remove xattr when null value is given to setxattr() For the setxattr request, introduce a new flag CEPH_XATTR_REMOVE to distinguish null value case from the zero-length value case. Signed-off-by: Yan, Zheng --- fs/ceph/xattr.c | 16 ++++++++++++++-- include/linux/ceph/ceph_fs.h | 5 +++-- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 28f9793b9167..231c02b16c0c 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -12,6 +12,9 @@ #define XATTR_CEPH_PREFIX "ceph." #define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1) +static int __remove_xattr(struct ceph_inode_info *ci, + struct ceph_inode_xattr *xattr); + /* * List of handlers for synthetic system.* attributes. Other * attributes are handled directly. @@ -359,6 +362,12 @@ static int __set_xattr(struct ceph_inode_info *ci, kfree(val); return err; } + if (update_xattr < 0) { + if (xattr) + __remove_xattr(ci, xattr); + kfree(name); + return 0; + } } if (!xattr) { @@ -862,6 +871,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, dout("setxattr value=%.*s\n", (int)size, value); + if (!value) + flags |= CEPH_XATTR_REMOVE; + /* do request */ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, USE_AUTH_MDS); @@ -965,8 +977,8 @@ retry: goto retry; } - err = __set_xattr(ci, newname, name_len, newval, - val_len, flags, 1, &xattr); + err = __set_xattr(ci, newname, name_len, newval, val_len, + flags, value ? 1 : -1, &xattr); if (!err) { dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 2623cffc73a1..25bfb0eff772 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -373,8 +373,9 @@ extern const char *ceph_mds_op_name(int op); /* * Ceph setxattr request flags. */ -#define CEPH_XATTR_CREATE 1 -#define CEPH_XATTR_REPLACE 2 +#define CEPH_XATTR_CREATE (1 << 0) +#define CEPH_XATTR_REPLACE (1 << 1) +#define CEPH_XATTR_REMOVE (1 << 31) union ceph_mds_request_args { struct { -- cgit v1.2.3 From 05fb7a56ad65ceaa1907bb9c66bfa1108847667f Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 23 Jan 2014 14:35:28 +0000 Subject: mfd: max8997: Naturalise cross-architecture discrepancies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we compile the MAX8997 for a 64bit architecture we receive the following warnings: drivers/mfd/max8997.c: In function ‘max8997_i2c_get_driver_data’: drivers/mfd/max8997.c:173:10: warning: cast from pointer to integer of different size return (int)match->data; ^ Signed-off-by: Lee Jones --- drivers/mfd/max8997.c | 6 +++--- include/linux/mfd/max8997-private.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/max8997.c b/drivers/mfd/max8997.c index be88a3bf7b85..5adede0fb04c 100644 --- a/drivers/mfd/max8997.c +++ b/drivers/mfd/max8997.c @@ -164,15 +164,15 @@ static struct max8997_platform_data *max8997_i2c_parse_dt_pdata( return pd; } -static inline int max8997_i2c_get_driver_data(struct i2c_client *i2c, +static inline unsigned long max8997_i2c_get_driver_data(struct i2c_client *i2c, const struct i2c_device_id *id) { if (IS_ENABLED(CONFIG_OF) && i2c->dev.of_node) { const struct of_device_id *match; match = of_match_node(max8997_pmic_dt_match, i2c->dev.of_node); - return (int)match->data; + return (unsigned long)match->data; } - return (int)id->driver_data; + return id->driver_data; } static int max8997_i2c_probe(struct i2c_client *i2c, diff --git a/include/linux/mfd/max8997-private.h b/include/linux/mfd/max8997-private.h index ad1ae7f345ad..78c76cd4d37b 100644 --- a/include/linux/mfd/max8997-private.h +++ b/include/linux/mfd/max8997-private.h @@ -387,7 +387,7 @@ struct max8997_dev { struct i2c_client *muic; /* slave addr 0x4a */ struct mutex iolock; - int type; + unsigned long type; struct platform_device *battery; /* battery control (not fuel gauge) */ int irq; -- cgit v1.2.3 From 8bace2d5b4baa0f60b6094b53aeb79515ec94e4a Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 3 Feb 2014 08:22:30 +0000 Subject: mfd: max8998: Naturalise cross-architecture discrepancies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we compile the MAX8998 for a 64bit architecture we receive the following warnings: drivers/mfd/max8998.c: In function ‘max8998_i2c_get_driver_data’: drivers/mfd/max8998.c:178:10: warning: cast from pointer to integer of different size return (int)match->data; ^ Signed-off-by: Lee Jones --- drivers/mfd/max8998.c | 6 +++--- include/linux/mfd/max8998-private.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/max8998.c b/drivers/mfd/max8998.c index 612ca404e150..5d5e186b5d8b 100644 --- a/drivers/mfd/max8998.c +++ b/drivers/mfd/max8998.c @@ -169,16 +169,16 @@ static struct max8998_platform_data *max8998_i2c_parse_dt_pdata( return pd; } -static inline int max8998_i2c_get_driver_data(struct i2c_client *i2c, +static inline unsigned long max8998_i2c_get_driver_data(struct i2c_client *i2c, const struct i2c_device_id *id) { if (IS_ENABLED(CONFIG_OF) && i2c->dev.of_node) { const struct of_device_id *match; match = of_match_node(max8998_dt_match, i2c->dev.of_node); - return (int)(long)match->data; + return (unsigned long)match->data; } - return (int)id->driver_data; + return id->driver_data; } static int max8998_i2c_probe(struct i2c_client *i2c, diff --git a/include/linux/mfd/max8998-private.h b/include/linux/mfd/max8998-private.h index 4ecb24b4b863..d68ada502ff3 100644 --- a/include/linux/mfd/max8998-private.h +++ b/include/linux/mfd/max8998-private.h @@ -163,7 +163,7 @@ struct max8998_dev { int ono; u8 irq_masks_cur[MAX8998_NUM_IRQ_REGS]; u8 irq_masks_cache[MAX8998_NUM_IRQ_REGS]; - int type; + unsigned long type; bool wakeup; }; -- cgit v1.2.3 From 5c6fbd56d16f38fddec629e1dccdeee5ad7f5a42 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 3 Feb 2014 08:24:20 +0000 Subject: mfd: tps65217: Naturalise cross-architecture discrepancies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we compile the TPS65217 for a 64bit architecture we receive the following warnings: drivers/mfd/tps65217.c: In function ‘tps65217_probe’: drivers/mfd/tps65217.c:173:13: warning: cast from pointer to integer of different size chip_id = (unsigned int)match->data; ^ Signed-off-by: Lee Jones --- drivers/mfd/tps65217.c | 4 ++-- include/linux/mfd/tps65217.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/tps65217.c b/drivers/mfd/tps65217.c index 966cf65c5c36..3cc4c7084b92 100644 --- a/drivers/mfd/tps65217.c +++ b/drivers/mfd/tps65217.c @@ -158,7 +158,7 @@ static int tps65217_probe(struct i2c_client *client, { struct tps65217 *tps; unsigned int version; - unsigned int chip_id = ids->driver_data; + unsigned long chip_id = ids->driver_data; const struct of_device_id *match; bool status_off = false; int ret; @@ -170,7 +170,7 @@ static int tps65217_probe(struct i2c_client *client, "Failed to find matching dt id\n"); return -EINVAL; } - chip_id = (unsigned int)(unsigned long)match->data; + chip_id = (unsigned long)match->data; status_off = of_property_read_bool(client->dev.of_node, "ti,pmic-shutdown-controller"); } diff --git a/include/linux/mfd/tps65217.h b/include/linux/mfd/tps65217.h index a5a7f0130e96..54b5458ec084 100644 --- a/include/linux/mfd/tps65217.h +++ b/include/linux/mfd/tps65217.h @@ -252,7 +252,7 @@ struct tps65217_board { struct tps65217 { struct device *dev; struct tps65217_board *pdata; - unsigned int id; + unsigned long id; struct regulator_desc desc[TPS65217_NUM_REGULATOR]; struct regulator_dev *rdev[TPS65217_NUM_REGULATOR]; struct regmap *regmap; @@ -263,7 +263,7 @@ static inline struct tps65217 *dev_to_tps65217(struct device *dev) return dev_get_drvdata(dev); } -static inline int tps65217_chip_id(struct tps65217 *tps65217) +static inline unsigned long tps65217_chip_id(struct tps65217 *tps65217) { return tps65217->id; } -- cgit v1.2.3 From 6d35ab48090b10c5ea5604ed5d6e91f302dc6060 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Feb 2014 17:19:29 +0100 Subject: sched: Add 'flags' argument to sched_{set,get}attr() syscalls Because of a recent syscall design debate; its deemed appropriate for each syscall to have a flags argument for future extension; without immediately requiring new syscalls. Cc: juri.lelli@gmail.com Cc: Ingo Molnar Suggested-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140214161929.GL27965@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- include/linux/syscalls.h | 6 ++++-- kernel/sched/core.c | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 40ed9e9a77e5..a747a77ea584 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -281,13 +281,15 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param); asmlinkage long sys_sched_setattr(pid_t pid, - struct sched_attr __user *attr); + struct sched_attr __user *attr, + unsigned int flags); asmlinkage long sys_sched_getscheduler(pid_t pid); asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param); asmlinkage long sys_sched_getattr(pid_t pid, struct sched_attr __user *attr, - unsigned int size); + unsigned int size, + unsigned int flags); asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr); asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a6e7470166c7..6edbef296ece 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3661,13 +3661,14 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) * @pid: the pid in question. * @uattr: structure containing the extended parameters. */ -SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) { struct sched_attr attr; struct task_struct *p; int retval; - if (!uattr || pid < 0) + if (!uattr || pid < 0 || flags) return -EINVAL; if (sched_copy_attr(uattr, &attr)) @@ -3804,8 +3805,8 @@ err_size: * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp. */ -SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size) +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) { struct sched_attr attr = { .size = sizeof(struct sched_attr), @@ -3814,7 +3815,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, int retval; if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0) + size < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; rcu_read_lock(); -- cgit v1.2.3 From 37f204164dfb0186a0caf20bc3e3120080bcd788 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Sat, 1 Mar 2014 11:56:05 +0100 Subject: PM: Add pm_runtime_suspend|resume_force functions This patch provides two new runtime PM helper functions which intend to be used from system suspend/resume callbacks, to make sure devices are put into low power state during system suspend and brought back to full power at system resume. The prerequisite is to have all levels of a device's runtime PM callbacks to be defined through the SET_PM_RUNTIME_PM_OPS macro, which means these are available for CONFIG_PM. By using the new runtime PM helper functions especially the two scenarios below will be addressed. 1) The PM core prevents .runtime_suspend callbacks from being invoked during system suspend. That means even for a runtime PM centric subsystem and driver, the device needs to be put into low power state from a system suspend callback. Otherwise it may very well be left in full power state (runtime resumed) while the system is suspended. By using the new helper functions, we make sure to walk the hierarchy of a device's power domain, subsystem and driver. 2) Subsystems and drivers need to cope with all the combinations of CONFIG_PM_SLEEP and CONFIG_PM_RUNTIME. The two new helper functions smothly addresses this. Signed-off-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/Makefile | 3 +- drivers/base/power/runtime.c | 84 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/pm_runtime.h | 4 +++ 3 files changed, 89 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index 2e58ebb1f6c0..1cb8544598d5 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -1,6 +1,5 @@ -obj-$(CONFIG_PM) += sysfs.o generic_ops.o common.o qos.o +obj-$(CONFIG_PM) += sysfs.o generic_ops.o common.o qos.o runtime.o obj-$(CONFIG_PM_SLEEP) += main.o wakeup.o -obj-$(CONFIG_PM_RUNTIME) += runtime.o obj-$(CONFIG_PM_TRACE_RTC) += trace.o obj-$(CONFIG_PM_OPP) += opp.o obj-$(CONFIG_PM_GENERIC_DOMAINS) += domain.o domain_governor.o diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index ac495b1357fa..4776cf528d08 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -44,6 +44,7 @@ static int (*rpm_get_resume_cb(struct device *dev))(struct device *) return RPM_GET_CALLBACK(dev, runtime_resume); } +#ifdef CONFIG_PM_RUNTIME static int (*rpm_get_idle_cb(struct device *dev))(struct device *) { return RPM_GET_CALLBACK(dev, runtime_idle); @@ -1401,3 +1402,86 @@ void pm_runtime_remove(struct device *dev) if (dev->power.irq_safe && dev->parent) pm_runtime_put(dev->parent); } +#endif + +/** + * pm_runtime_force_suspend - Force a device into suspend state if needed. + * @dev: Device to suspend. + * + * Disable runtime PM so we safely can check the device's runtime PM status and + * if it is active, invoke it's .runtime_suspend callback to bring it into + * suspend state. Keep runtime PM disabled to preserve the state unless we + * encounter errors. + * + * Typically this function may be invoked from a system suspend callback to make + * sure the device is put into low power state. + */ +int pm_runtime_force_suspend(struct device *dev) +{ + int (*callback)(struct device *); + int ret = 0; + + pm_runtime_disable(dev); + + /* + * Note that pm_runtime_status_suspended() returns false while + * !CONFIG_PM_RUNTIME, which means the device will be put into low + * power state. + */ + if (pm_runtime_status_suspended(dev)) + return 0; + + callback = rpm_get_suspend_cb(dev); + + if (!callback) { + ret = -ENOSYS; + goto err; + } + + ret = callback(dev); + if (ret) + goto err; + + pm_runtime_set_suspended(dev); + return 0; +err: + pm_runtime_enable(dev); + return ret; +} +EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); + +/** + * pm_runtime_force_resume - Force a device into resume state. + * @dev: Device to resume. + * + * Prior invoking this function we expect the user to have brought the device + * into low power state by a call to pm_runtime_force_suspend(). Here we reverse + * those actions and brings the device into full power. We update the runtime PM + * status and re-enables runtime PM. + * + * Typically this function may be invoked from a system resume callback to make + * sure the device is put into full power state. + */ +int pm_runtime_force_resume(struct device *dev) +{ + int (*callback)(struct device *); + int ret = 0; + + callback = rpm_get_resume_cb(dev); + + if (!callback) { + ret = -ENOSYS; + goto out; + } + + ret = callback(dev); + if (ret) + goto out; + + pm_runtime_set_active(dev); + pm_runtime_mark_last_busy(dev); +out: + pm_runtime_enable(dev); + return ret; +} +EXPORT_SYMBOL_GPL(pm_runtime_force_resume); diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 16c9a62fa1c0..2a5897a4afbc 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -26,9 +26,13 @@ #ifdef CONFIG_PM extern int pm_generic_runtime_suspend(struct device *dev); extern int pm_generic_runtime_resume(struct device *dev); +extern int pm_runtime_force_suspend(struct device *dev); +extern int pm_runtime_force_resume(struct device *dev); #else static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } static inline int pm_generic_runtime_resume(struct device *dev) { return 0; } +static inline int pm_runtime_force_suspend(struct device *dev) { return 0; } +static inline int pm_runtime_force_resume(struct device *dev) { return 0; } #endif #ifdef CONFIG_PM_RUNTIME -- cgit v1.2.3