From 1d91c1d2c80cb70e2e553845e278b87a960c04da Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 16 Feb 2018 13:20:42 -0800 Subject: nospec: Kill array_index_nospec_mask_check() There are multiple problems with the dynamic sanity checking in array_index_nospec_mask_check(): * It causes unnecessary overhead in the 32-bit case since integer sized @index values will no longer cause the check to be compiled away like in the 64-bit case. * In the 32-bit case it may trigger with user controllable input when the expectation is that should only trigger during development of new kernel enabling. * The macro reuses the input parameter in multiple locations which is broken if someone passes an expression like 'index++' to array_index_nospec(). Reported-by: Linus Torvalds Signed-off-by: Dan Williams Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: David Woodhouse Cc: Greg Kroah-Hartman Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/151881604278.17395.6605847763178076520.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- include/linux/nospec.h | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nospec.h b/include/linux/nospec.h index fbc98e2c8228..d6701e34424f 100644 --- a/include/linux/nospec.h +++ b/include/linux/nospec.h @@ -29,26 +29,6 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, } #endif -/* - * Warn developers about inappropriate array_index_nospec() usage. - * - * Even if the CPU speculates past the WARN_ONCE branch, the - * sign bit of @index is taken into account when generating the - * mask. - * - * This warning is compiled out when the compiler can infer that - * @index and @size are less than LONG_MAX. - */ -#define array_index_mask_nospec_check(index, size) \ -({ \ - if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX, \ - "array_index_nospec() limited to range of [0, LONG_MAX]\n")) \ - _mask = 0; \ - else \ - _mask = array_index_mask_nospec(index, size); \ - _mask; \ -}) - /* * array_index_nospec - sanitize an array index after a bounds check * @@ -67,7 +47,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, ({ \ typeof(index) _i = (index); \ typeof(size) _s = (size); \ - unsigned long _mask = array_index_mask_nospec_check(_i, _s); \ + unsigned long _mask = array_index_mask_nospec(_i, _s); \ \ BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ -- cgit v1.2.3 From b98c6a160a057d5686a8c54c79cc6c8c94a7d0c8 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 16 Feb 2018 13:20:48 -0800 Subject: nospec: Allow index argument to have const-qualified type The last expression in a statement expression need not be a bare variable, quoting gcc docs The last thing in the compound statement should be an expression followed by a semicolon; the value of this subexpression serves as the value of the entire construct. and we already use that in e.g. the min/max macros which end with a ternary expression. This way, we can allow index to have const-qualified type, which will in some cases avoid the need for introducing a local copy of index of non-const qualified type. That, in turn, can prevent readers not familiar with the internals of array_index_nospec from wondering about the seemingly redundant extra variable, and I think that's worthwhile considering how confusing the whole _nospec business is. The expression _i&_mask has type unsigned long (since that is the type of _mask, and the BUILD_BUG_ONs guarantee that _i will get promoted to that), so in order not to change the type of the whole expression, add a cast back to typeof(_i). Signed-off-by: Rasmus Villemoes Signed-off-by: Dan Williams Acked-by: Linus Torvalds Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: David Woodhouse Cc: Greg Kroah-Hartman Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arch@vger.kernel.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/151881604837.17395.10812767547837568328.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- include/linux/nospec.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nospec.h b/include/linux/nospec.h index d6701e34424f..172a19dc35ab 100644 --- a/include/linux/nospec.h +++ b/include/linux/nospec.h @@ -52,7 +52,6 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \ BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \ \ - _i &= _mask; \ - _i; \ + (typeof(_i)) (_i & _mask); \ }) #endif /* _LINUX_NOSPEC_H */ -- cgit v1.2.3 From eb6174f6d1be16b19cfa43dac296bfed003ce1a6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 16 Feb 2018 13:20:54 -0800 Subject: nospec: Include dependency The nospec.h header expects the per-architecture header file to optionally define array_index_mask_nospec(). Include that dependency to prevent inadvertent fallback to the default array_index_mask_nospec() implementation. The default implementation may not provide a full mitigation on architectures that perform data value speculation. Reported-by: Christian Borntraeger Signed-off-by: Dan Williams Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: David Woodhouse Cc: Greg Kroah-Hartman Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/151881605404.17395.1341935530792574707.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- include/linux/nospec.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nospec.h b/include/linux/nospec.h index 172a19dc35ab..e791ebc65c9c 100644 --- a/include/linux/nospec.h +++ b/include/linux/nospec.h @@ -5,6 +5,7 @@ #ifndef _LINUX_NOSPEC_H #define _LINUX_NOSPEC_H +#include /** * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise -- cgit v1.2.3 From 87358710c1fb4f1bf96bbe2349975ff9953fc9b2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 19 Feb 2018 10:50:57 +0000 Subject: x86/retpoline: Support retpoline builds with Clang Signed-off-by: David Woodhouse Reviewed-by: Thomas Gleixner Cc: Linus Torvalds Cc: Peter Zijlstra Cc: arjan.van.de.ven@intel.com Cc: bp@alien8.de Cc: dave.hansen@intel.com Cc: jmattson@google.com Cc: karahmed@amazon.de Cc: kvm@vger.kernel.org Cc: pbonzini@redhat.com Cc: rkrcmar@redhat.com Link: http://lkml.kernel.org/r/1519037457-7643-5-git-send-email-dwmw@amazon.co.uk Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 5 ++++- include/linux/compiler-clang.h | 5 +++++ include/linux/compiler-gcc.h | 4 ++++ include/linux/init.h | 8 ++++---- 4 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index fad55160dcb9..dbc7d0ed2eaa 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -232,7 +232,10 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # Avoid indirect branches in kernel to deal with Spectre ifdef CONFIG_RETPOLINE - RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) + RETPOLINE_CFLAGS_GCC := -mindirect-branch=thunk-extern -mindirect-branch-register + RETPOLINE_CFLAGS_CLANG := -mretpoline-external-thunk + + RETPOLINE_CFLAGS += $(call cc-option,$(RETPOLINE_CFLAGS_GCC),$(call cc-option,$(RETPOLINE_CFLAGS_CLANG))) ifneq ($(RETPOLINE_CFLAGS),) KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE endif diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index d02a4df3f473..d3f264a5b04d 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -27,3 +27,8 @@ #if __has_feature(address_sanitizer) #define __SANITIZE_ADDRESS__ #endif + +/* Clang doesn't have a way to turn it off per-function, yet. */ +#ifdef __noretpoline +#undef __noretpoline +#endif diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 73bc63e0a1c4..673fbf904fe5 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -93,6 +93,10 @@ #define __weak __attribute__((weak)) #define __alias(symbol) __attribute__((alias(#symbol))) +#ifdef RETPOLINE +#define __noretpoline __attribute__((indirect_branch("keep"))) +#endif + /* * it doesn't make sense on ARM (currently the only user of __naked) * to trace naked functions because then mcount is called without diff --git a/include/linux/init.h b/include/linux/init.h index 506a98151131..bc27cf03c41e 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -6,10 +6,10 @@ #include /* Built-in __init functions needn't be compiled with retpoline */ -#if defined(RETPOLINE) && !defined(MODULE) -#define __noretpoline __attribute__((indirect_branch("keep"))) +#if defined(__noretpoline) && !defined(MODULE) +#define __noinitretpoline __noretpoline #else -#define __noretpoline +#define __noinitretpoline #endif /* These macros are used to mark some functions or @@ -47,7 +47,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold __latent_entropy __noretpoline +#define __init __section(.init.text) __cold __latent_entropy __noinitretpoline #define __initdata __section(.init.data) #define __initconst __section(.init.rodata) #define __exitdata __section(.exit.data) -- cgit v1.2.3 From 33352244706369ea6736781ae41fe41692eb69bb Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 20 Feb 2018 11:37:51 -0600 Subject: jump_label: Explicitly disable jump labels in __init code After initmem has been freed, any jump labels in __init code are prevented from being written to by the kernel_text_address() check in __jump_label_update(). However, this check is quite broad. If kernel_text_address() were to return false for any other reason, the jump label write would fail silently with no warning. For jump labels in module init code, entry->code is set to zero to indicate that the entry is disabled. Do the same thing for core kernel init code. This makes the behavior more consistent, and will also make it more straightforward to detect non-init jump label write failures in the next patch. Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra Cc: Borislav Petkov Cc: Jason Baron Cc: Linus Torvalds Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c52825c73f3a174e8398b6898284ec20d4deb126.1519051220.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 3 +++ init/main.c | 2 ++ kernel/jump_label.c | 16 ++++++++++++++++ 3 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index b6a29c126cc4..2168cc6b8b30 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -151,6 +151,7 @@ extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; extern void jump_label_init(void); +extern void jump_label_invalidate_init(void); extern void jump_label_lock(void); extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, @@ -198,6 +199,8 @@ static __always_inline void jump_label_init(void) static_key_initialized = true; } +static inline void jump_label_invalidate_init(void) {} + static __always_inline bool static_key_false(struct static_key *key) { if (unlikely(static_key_count(key) > 0)) diff --git a/init/main.c b/init/main.c index a8100b954839..969eaf140ef0 100644 --- a/init/main.c +++ b/init/main.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include @@ -1000,6 +1001,7 @@ static int __ref kernel_init(void *unused) /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); ftrace_free_init_mem(); + jump_label_invalidate_init(); free_initmem(); mark_readonly(); system_state = SYSTEM_RUNNING; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index b4517095db6a..b71776576a66 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef HAVE_JUMP_LABEL @@ -417,6 +418,20 @@ void __init jump_label_init(void) cpus_read_unlock(); } +/* Disable any jump label entries in __init code */ +void __init jump_label_invalidate_init(void) +{ + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_entry *iter; + + for (iter = iter_start; iter < iter_stop; iter++) { + if (iter->code >= (unsigned long)_sinittext && + iter->code < (unsigned long)_einittext) + iter->code = 0; + } +} + #ifdef CONFIG_MODULES static enum jump_label_type jump_label_init_type(struct jump_entry *entry) @@ -633,6 +648,7 @@ static void jump_label_del_module(struct module *mod) } } +/* Disable any jump label entries in module init code */ static void jump_label_invalidate_module_init(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; -- cgit v1.2.3 From 9fbcc57aa16424ef84cb54e0d9db3221763de88a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 20 Feb 2018 11:37:53 -0600 Subject: extable: Make init_kernel_text() global Convert init_kernel_text() to a global function and use it in a few places instead of manually comparing _sinittext and _einittext. Note that kallsyms.h has a very similar function called is_kernel_inittext(), but its end check is inclusive. I'm not sure whether that's intentional behavior, so I didn't touch it. Suggested-by: Jason Baron Signed-off-by: Josh Poimboeuf Acked-by: Peter Zijlstra Acked-by: Steven Rostedt (VMware) Cc: Borislav Petkov Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/4335d02be8d45ca7d265d2f174251d0b7ee6c5fd.1519051220.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_orc.c | 3 +-- include/linux/kernel.h | 1 + kernel/extable.c | 2 +- kernel/jump_label.c | 4 +--- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 1f9188f5357c..feb28fee6cea 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -5,7 +5,6 @@ #include #include #include -#include #define orc_warn(fmt, ...) \ printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__) @@ -148,7 +147,7 @@ static struct orc_entry *orc_find(unsigned long ip) } /* vmlinux .init slow lookup: */ - if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext) + if (init_kernel_text(ip)) return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index ce51455e2adf..3fd291503576 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -472,6 +472,7 @@ extern bool parse_option_str(const char *str, const char *option); extern char *next_arg(char *args, char **param, char **val); extern int core_kernel_text(unsigned long addr); +extern int init_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); diff --git a/kernel/extable.c b/kernel/extable.c index a17fdb63dc3e..6a5b61ebc66c 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -64,7 +64,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -static inline int init_kernel_text(unsigned long addr) +int init_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_sinittext && addr < (unsigned long)_einittext) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index b2f0b479191b..52a0a7af8640 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -16,7 +16,6 @@ #include #include #include -#include #ifdef HAVE_JUMP_LABEL @@ -429,8 +428,7 @@ void __init jump_label_invalidate_init(void) struct jump_entry *iter; for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->code >= (unsigned long)_sinittext && - iter->code < (unsigned long)_einittext) + if (init_kernel_text(iter->code)) iter->code = 0; } } -- cgit v1.2.3 From 076467490b8176eb96eddc548a14d4135c7b5852 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 22 Feb 2018 13:05:41 +0100 Subject: kvm: fix warning for CONFIG_HAVE_KVM_EVENTFD builds Move the kvm_arch_irq_routing_update() prototype outside of ifdef CONFIG_HAVE_KVM_EVENTFD guards to fix the following sparse warning: arch/s390/kvm/../../../virt/kvm/irqchip.c:171:28: warning: symbol 'kvm_arch_irq_routing_update' was not declared. Should it be static? Signed-off-by: Sebastian Ott Acked-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ac0062b74aed..84b9c50693f2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1105,7 +1105,6 @@ static inline void kvm_irq_routing_update(struct kvm *kvm) { } #endif -void kvm_arch_irq_routing_update(struct kvm *kvm); static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { @@ -1114,6 +1113,8 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) #endif /* CONFIG_HAVE_KVM_EVENTFD */ +void kvm_arch_irq_routing_update(struct kvm *kvm); + static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* -- cgit v1.2.3 From f75e4924f0152be747bf04c9d16bb23fd8baf5f9 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 22 Feb 2018 13:04:39 +0100 Subject: kvm: fix warning for non-x86 builds Fix the following sparse warning by moving the prototype of kvm_arch_mmu_notifier_invalidate_range() to linux/kvm_host.h . CHECK arch/s390/kvm/../../../virt/kvm/kvm_main.c arch/s390/kvm/../../../virt/kvm/kvm_main.c:138:13: warning: symbol 'kvm_arch_mmu_notifier_invalidate_range' was not declared. Should it be static? Signed-off-by: Sebastian Ott Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 --- include/linux/kvm_host.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd6f57a54a26..0a9e330b34f0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1464,7 +1464,4 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end); - #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 84b9c50693f2..6930c63126c7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1273,4 +1273,7 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end); + #endif -- cgit v1.2.3 From 3079c22ea815775837a4f389ce2f7e1e7b202e09 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 26 Feb 2018 13:01:38 +0100 Subject: genhd: Rename get_disk() to get_disk_and_module() Rename get_disk() to get_disk_and_module() to make sure what the function does. It's not a great name but at least it is now clear that put_disk() is not it's counterpart. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/genhd.c | 10 ++++------ drivers/block/amiflop.c | 2 +- drivers/block/ataflop.c | 2 +- drivers/block/brd.c | 2 +- drivers/block/floppy.c | 2 +- drivers/block/loop.c | 2 +- drivers/block/swim.c | 2 +- drivers/block/z2ram.c | 2 +- drivers/ide/ide-probe.c | 2 +- include/linux/genhd.h | 2 +- 10 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 5098bffe6ba6..21b2843b27d0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -547,7 +547,7 @@ static int exact_lock(dev_t devt, void *data) { struct gendisk *p = data; - if (!get_disk(p)) + if (!get_disk_and_module(p)) return -1; return 0; } @@ -809,7 +809,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_lock_bh(&ext_devt_lock); part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - if (part && get_disk(part_to_disk(part))) { + if (part && get_disk_and_module(part_to_disk(part))) { *partno = part->partno; disk = part_to_disk(part); } @@ -1456,7 +1456,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) } EXPORT_SYMBOL(__alloc_disk_node); -struct kobject *get_disk(struct gendisk *disk) +struct kobject *get_disk_and_module(struct gendisk *disk) { struct module *owner; struct kobject *kobj; @@ -1474,15 +1474,13 @@ struct kobject *get_disk(struct gendisk *disk) return kobj; } - -EXPORT_SYMBOL(get_disk); +EXPORT_SYMBOL(get_disk_and_module); void put_disk(struct gendisk *disk) { if (disk) kobject_put(&disk_to_dev(disk)->kobj); } - EXPORT_SYMBOL(put_disk); static void set_disk_ro_uevent(struct gendisk *gd, int ro) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index e5aa62fcf5a8..3aaf6af3ec23 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1758,7 +1758,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data) if (unit[drive].type->code == FD_NODRIVE) return NULL; *part = 0; - return get_disk(unit[drive].gendisk); + return get_disk_and_module(unit[drive].gendisk); } static int __init amiga_floppy_probe(struct platform_device *pdev) diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 8bc3b9fd8dd2..dfb2c2622e5a 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1917,7 +1917,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data) if (drive >= FD_MAX_UNITS || type > NUM_DISK_MINORS) return NULL; *part = 0; - return get_disk(unit[drive].disk); + return get_disk_and_module(unit[drive].disk); } static int __init atari_floppy_init (void) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 8028a3a7e7fd..deea78e485da 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -456,7 +456,7 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data) mutex_lock(&brd_devices_mutex); brd = brd_init_one(MINOR(dev) / max_part, &new); - kobj = brd ? get_disk(brd->brd_disk) : NULL; + kobj = brd ? get_disk_and_module(brd->brd_disk) : NULL; mutex_unlock(&brd_devices_mutex); if (new) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index eae484acfbbc..8ec7235fc93b 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4505,7 +4505,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data) if (((*part >> 2) & 0x1f) >= ARRAY_SIZE(floppy_type)) return NULL; *part = 0; - return get_disk(disks[drive]); + return get_disk_and_module(disks[drive]); } static int __init do_floppy_init(void) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d5fe720cf149..87855b5123a6 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1922,7 +1922,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data) if (err < 0) kobj = NULL; else - kobj = get_disk(lo->lo_disk); + kobj = get_disk_and_module(lo->lo_disk); mutex_unlock(&loop_index_mutex); *part = 0; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 84434d3ea19b..64e066eba72e 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -799,7 +799,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data) return NULL; *part = 0; - return get_disk(swd->unit[drive].disk); + return get_disk_and_module(swd->unit[drive].disk); } static int swim_add_floppy(struct swim_priv *swd, enum drive_location location) diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 41c95c9b2ab4..8f9130ab5887 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -332,7 +332,7 @@ static const struct block_device_operations z2_fops = static struct kobject *z2_find(dev_t dev, int *part, void *data) { *part = 0; - return get_disk(z2ram_gendisk); + return get_disk_and_module(z2ram_gendisk); } static struct request_queue *z2_queue; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 17fd55af4d92..caa20eb5f26b 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -928,7 +928,7 @@ static int exact_lock(dev_t dev, void *data) { struct gendisk *p = data; - if (!get_disk(p)) + if (!get_disk_and_module(p)) return -1; return 0; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5e3531027b51..8e11b9321e55 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -600,7 +600,7 @@ extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); extern struct gendisk *__alloc_disk_node(int minors, int node_id); -extern struct kobject *get_disk(struct gendisk *disk); +extern struct kobject *get_disk_and_module(struct gendisk *disk); extern void put_disk(struct gendisk *disk); extern void blk_register_region(dev_t devt, unsigned long range, struct module *module, -- cgit v1.2.3 From 9df6c29912315186fef1c79cc15b758ace84175b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 26 Feb 2018 13:01:39 +0100 Subject: genhd: Add helper put_disk_and_module() Add a proper counterpart to get_disk_and_module() - put_disk_and_module(). Currently it is opencoded in several places. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 11 ++--------- block/genhd.c | 20 ++++++++++++++++---- fs/block_dev.c | 19 +++++-------------- include/linux/genhd.h | 1 + 4 files changed, 24 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4117524ca45b..c2033a232a44 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -812,7 +812,6 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; - struct module *owner; unsigned int major, minor; int key_len, part, ret; char *body; @@ -904,9 +903,7 @@ fail_unlock: spin_unlock_irq(q->queue_lock); rcu_read_unlock(); fail: - owner = disk->fops->owner; - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -931,13 +928,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); void blkg_conf_finish(struct blkg_conf_ctx *ctx) __releases(ctx->disk->queue->queue_lock) __releases(rcu) { - struct module *owner; - spin_unlock_irq(ctx->disk->queue->queue_lock); rcu_read_unlock(); - owner = ctx->disk->fops->owner; - put_disk(ctx->disk); - module_put(owner); + put_disk_and_module(ctx->disk); } EXPORT_SYMBOL_GPL(blkg_conf_finish); diff --git a/block/genhd.c b/block/genhd.c index 21b2843b27d0..4c0590434591 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -817,10 +817,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) } if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); disk = NULL; } return disk; @@ -1483,6 +1480,21 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); +/* + * This is a counterpart of get_disk_and_module() and thus also of + * get_gendisk(). + */ +void put_disk_and_module(struct gendisk *disk) +{ + if (disk) { + struct module *owner = disk->fops->owner; + + put_disk(disk); + module_put(owner); + } +} +EXPORT_SYMBOL(put_disk_and_module); + static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; diff --git a/fs/block_dev.c b/fs/block_dev.c index 4a181fcb5175..1dbbf847911a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1111,8 +1111,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, else whole = bdgrab(bdev); - module_put(disk->fops->owner); - put_disk(disk); + put_disk_and_module(disk); if (!whole) return ERR_PTR(-ENOMEM); @@ -1407,7 +1406,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; - struct module *owner; int ret; int partno; int perm = 0; @@ -1433,7 +1431,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) goto out; - owner = disk->fops->owner; disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); @@ -1463,8 +1460,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); goto restart; } } @@ -1525,8 +1521,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_unlock_bdev; } /* only one opener holds refs to the module and disk */ - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); } bdev->bd_openers++; if (for_part) @@ -1546,8 +1541,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); out: bdput(bdev); @@ -1770,8 +1764,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk->fops->release(disk, mode); } if (!bdev->bd_openers) { - struct module *owner = disk->fops->owner; - disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; @@ -1779,8 +1771,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) victim = bdev->bd_contains; bdev->bd_contains = NULL; - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 8e11b9321e55..7f5906fe1b70 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -602,6 +602,7 @@ extern void printk_all_partitions(void); extern struct gendisk *__alloc_disk_node(int minors, int node_id); extern struct kobject *get_disk_and_module(struct gendisk *disk); extern void put_disk(struct gendisk *disk); +extern void put_disk_and_module(struct gendisk *disk); extern void blk_register_region(dev_t devt, unsigned long range, struct module *module, struct kobject *(*probe)(dev_t, int *, void *), -- cgit v1.2.3 From 56c0908c855afbb2bdda17c15d2879949a091ad3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 26 Feb 2018 13:01:41 +0100 Subject: genhd: Fix BUG in blkdev_open() When two blkdev_open() calls for a partition race with device removal and recreation, we can hit BUG_ON(!bd_may_claim(bdev, whole, holder)) in blkdev_open(). The race can happen as follows: CPU0 CPU1 CPU2 del_gendisk() bdev_unhash_inode(part1); blkdev_open(part1, O_EXCL) blkdev_open(part1, O_EXCL) bdev = bd_acquire() bdev = bd_acquire() blkdev_get(bdev) bd_start_claiming(bdev) - finds old inode 'whole' bd_prepare_to_claim() -> 0 bdev_unhash_inode(whole); blkdev_get(bdev); bd_start_claiming(bdev) - finds new inode 'whole' bd_prepare_to_claim() - this also succeeds as we have different 'whole' here... - bad things happen now as we have two exclusive openers of the same bdev The problem here is that block device opens can see various intermediate states while gendisk is shutting down and then being recreated. We fix the problem by introducing new lookup_sem in gendisk that synchronizes gendisk deletion with get_gendisk() and furthermore by making sure that get_gendisk() does not return gendisk that is being (or has been) deleted. This makes sure that once we ever manage to look up newly created bdev inode, we are also guaranteed that following get_gendisk() will either return failure (and we fail open) or it returns gendisk for the new device and following bdget_disk() will return new bdev inode (i.e., blkdev_open() follows the path as if it is completely run after new device is created). Reported-and-analyzed-by: Hou Tao Tested-by: Hou Tao Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/genhd.c | 21 ++++++++++++++++++++- include/linux/genhd.h | 1 + 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/genhd.c b/block/genhd.c index 4c0590434591..9656f9e9f99e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -717,6 +717,11 @@ void del_gendisk(struct gendisk *disk) blk_integrity_del(disk); disk_del_events(disk); + /* + * Block lookups of the disk until all bdevs are unhashed and the + * disk is marked as dead (GENHD_FL_UP cleared). + */ + down_write(&disk->lookup_sem); /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); @@ -731,6 +736,7 @@ void del_gendisk(struct gendisk *disk) bdev_unhash_inode(disk_devt(disk)); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; + up_write(&disk->lookup_sem); if (!(disk->flags & GENHD_FL_HIDDEN)) sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -816,9 +822,21 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) spin_unlock_bh(&ext_devt_lock); } - if (disk && unlikely(disk->flags & GENHD_FL_HIDDEN)) { + if (!disk) + return NULL; + + /* + * Synchronize with del_gendisk() to not return disk that is being + * destroyed. + */ + down_read(&disk->lookup_sem); + if (unlikely((disk->flags & GENHD_FL_HIDDEN) || + !(disk->flags & GENHD_FL_UP))) { + up_read(&disk->lookup_sem); put_disk_and_module(disk); disk = NULL; + } else { + up_read(&disk->lookup_sem); } return disk; } @@ -1418,6 +1436,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) kfree(disk); return NULL; } + init_rwsem(&disk->lookup_sem); disk->node_id = node_id; if (disk_expand_part_tbl(disk, 0)) { free_part_stats(&disk->part0); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7f5906fe1b70..c826b0b5232a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -198,6 +198,7 @@ struct gendisk { void *private_data; int flags; + struct rw_semaphore lookup_sem; struct kobject *slave_dir; struct timer_rand_state *random; -- cgit v1.2.3 From 230f5a8969d8345fc9bbe3683f068246cf1be4b8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 21 Feb 2018 17:08:01 -0800 Subject: dax: fix vma_is_fsdax() helper Gerd reports that ->i_mode may contain other bits besides S_IFCHR. Use S_ISCHR() instead. Otherwise, get_user_pages_longterm() may fail on device-dax instances when those are meant to be explicitly allowed. Fixes: 2bb6d2837083 ("mm: introduce get_user_pages_longterm") Cc: Reported-by: Gerd Rausch Acked-by: Jane Chu Reported-by: Haozhong Zhang Reviewed-by: Jan Kara Signed-off-by: Dan Williams --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2a815560fda0..79c413985305 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3198,7 +3198,7 @@ static inline bool vma_is_fsdax(struct vm_area_struct *vma) if (!vma_is_dax(vma)) return false; inode = file_inode(vma->vm_file); - if (inode->i_mode == S_IFCHR) + if (S_ISCHR(inode->i_mode)) return false; /* device-dax */ return true; } -- cgit v1.2.3 From 9c2c2e62df3fa30fb13fbeb7512a4eede729383b Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 27 Feb 2018 01:56:06 +0100 Subject: net: phy: Restore phy_resume() locking assumption commit f5e64032a799 ("net: phy: fix resume handling") changes the locking semantics for phy_resume() such that the caller now needs to hold the phy mutex. Not all call sites were adopted to this new semantic, resulting in warnings from the added WARN_ON(!mutex_is_locked(&phydev->lock)). Rather than change the semantics, add a __phy_resume() and restore the old behavior of phy_resume(). Reported-by: Heiner Kallweit Fixes: f5e64032a799 ("net: phy: fix resume handling") Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 2 +- drivers/net/phy/phy_device.c | 18 +++++++++++++----- include/linux/phy.h | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e3e29c2b028b..a6f924fee584 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -819,7 +819,7 @@ void phy_start(struct phy_device *phydev) break; case PHY_HALTED: /* if phy was suspended, bring the physical link up again */ - phy_resume(phydev); + __phy_resume(phydev); /* make sure interrupts are re-enabled for the PHY */ if (phy_interrupt_is_valid(phydev)) { diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index d39ae77707ef..478405e544cc 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -135,9 +135,7 @@ static int mdio_bus_phy_resume(struct device *dev) if (!mdio_bus_phy_may_suspend(phydev)) goto no_resume; - mutex_lock(&phydev->lock); ret = phy_resume(phydev); - mutex_unlock(&phydev->lock); if (ret < 0) return ret; @@ -1041,9 +1039,7 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, if (err) goto error; - mutex_lock(&phydev->lock); phy_resume(phydev); - mutex_unlock(&phydev->lock); phy_led_triggers_register(phydev); return err; @@ -1172,7 +1168,7 @@ int phy_suspend(struct phy_device *phydev) } EXPORT_SYMBOL(phy_suspend); -int phy_resume(struct phy_device *phydev) +int __phy_resume(struct phy_device *phydev) { struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver); int ret = 0; @@ -1189,6 +1185,18 @@ int phy_resume(struct phy_device *phydev) return ret; } +EXPORT_SYMBOL(__phy_resume); + +int phy_resume(struct phy_device *phydev) +{ + int ret; + + mutex_lock(&phydev->lock); + ret = __phy_resume(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} EXPORT_SYMBOL(phy_resume); int phy_loopback(struct phy_device *phydev, bool enable) diff --git a/include/linux/phy.h b/include/linux/phy.h index 5a0c3e53e7c2..d7069539f351 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -924,6 +924,7 @@ void phy_device_remove(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); +int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phy_interface_t interface); -- cgit v1.2.3 From 28b0f8a6962a24ed21737578f3b1b07424635c9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Feb 2018 07:38:08 -0800 Subject: tty: make n_tty_read() always abort if hangup is in progress A tty is hung up by __tty_hangup() setting file->f_op to hung_up_tty_fops, which is skipped on ttys whose write operation isn't tty_write(). This means that, for example, /dev/console whose write op is redirected_tty_write() is never actually marked hung up. Because n_tty_read() uses the hung up status to decide whether to abort the waiting readers, the lack of hung-up marking can lead to the following scenario. 1. A session contains two processes. The leader and its child. The child ignores SIGHUP. 2. The leader exits and starts disassociating from the controlling terminal (/dev/console). 3. __tty_hangup() skips setting f_op to hung_up_tty_fops. 4. SIGHUP is delivered and ignored. 5. tty_ldisc_hangup() is invoked. It wakes up the waits which should clear the read lockers of tty->ldisc_sem. 6. The reader wakes up but because tty_hung_up_p() is false, it doesn't abort and goes back to sleep while read-holding tty->ldisc_sem. 7. The leader progresses to tty_ldisc_lock() in tty_ldisc_hangup() and is now stuck in D sleep indefinitely waiting for tty->ldisc_sem. The following is Alan's explanation on why some ttys aren't hung up. http://lkml.kernel.org/r/20171101170908.6ad08580@alans-desktop 1. It broke the serial consoles because they would hang up and close down the hardware. With tty_port that *should* be fixable properly for any cases remaining. 2. The console layer was (and still is) completely broken and doens't refcount properly. So if you turn on console hangups it breaks (as indeed does freeing consoles and half a dozen other things). As neither can be fixed quickly, this patch works around the problem by introducing a new flag, TTY_HUPPING, which is used solely to tell n_tty_read() that hang-up is in progress for the console and the readers should be aborted regardless of the hung-up status of the device. The following is a sample hung task warning caused by this issue. INFO: task agetty:2662 blocked for more than 120 seconds. Not tainted 4.11.3-dbg-tty-lockup-02478-gfd6c7ee-dirty #28 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 0 2662 1 0x00000086 Call Trace: __schedule+0x267/0x890 schedule+0x36/0x80 schedule_timeout+0x23c/0x2e0 ldsem_down_write+0xce/0x1f6 tty_ldisc_lock+0x16/0x30 tty_ldisc_hangup+0xb3/0x1b0 __tty_hangup+0x300/0x410 disassociate_ctty+0x6c/0x290 do_exit+0x7ef/0xb00 do_group_exit+0x3f/0xa0 get_signal+0x1b3/0x5d0 do_signal+0x28/0x660 exit_to_usermode_loop+0x46/0x86 do_syscall_64+0x9c/0xb0 entry_SYSCALL64_slow_path+0x25/0x25 The following is the repro. Run "$PROG /dev/console". The parent process hangs in D state. #include #include #include #include #include #include #include #include #include #include #include #include int main(int argc, char **argv) { struct sigaction sact = { .sa_handler = SIG_IGN }; struct timespec ts1s = { .tv_sec = 1 }; pid_t pid; int fd; if (argc < 2) { fprintf(stderr, "test-hung-tty /dev/$TTY\n"); return 1; } /* fork a child to ensure that it isn't already the session leader */ pid = fork(); if (pid < 0) { perror("fork"); return 1; } if (pid > 0) { /* top parent, wait for everyone */ while (waitpid(-1, NULL, 0) >= 0) ; if (errno != ECHILD) perror("waitpid"); return 0; } /* new session, start a new session and set the controlling tty */ if (setsid() < 0) { perror("setsid"); return 1; } fd = open(argv[1], O_RDWR); if (fd < 0) { perror("open"); return 1; } if (ioctl(fd, TIOCSCTTY, 1) < 0) { perror("ioctl"); return 1; } /* fork a child, sleep a bit and exit */ pid = fork(); if (pid < 0) { perror("fork"); return 1; } if (pid > 0) { nanosleep(&ts1s, NULL); printf("Session leader exiting\n"); exit(0); } /* * The child ignores SIGHUP and keeps reading from the controlling * tty. Because SIGHUP is ignored, the child doesn't get killed on * parent exit and the bug in n_tty makes the read(2) block the * parent's control terminal hangup attempt. The parent ends up in * D sleep until the child is explicitly killed. */ sigaction(SIGHUP, &sact, NULL); printf("Child reading tty\n"); while (1) { char buf[1024]; if (read(fd, buf, sizeof(buf)) < 0) { perror("read"); return 1; } } return 0; } Signed-off-by: Tejun Heo Cc: Alan Cox Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_tty.c | 6 ++++++ drivers/tty/tty_io.c | 9 +++++++++ include/linux/tty.h | 1 + 3 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 5c0e59e8fe46..cbe98bc2b998 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -2180,6 +2180,12 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file, } if (tty_hung_up_p(file)) break; + /* + * Abort readers for ttys which never actually + * get hung up. See __tty_hangup(). + */ + if (test_bit(TTY_HUPPING, &tty->flags)) + break; if (!timeout) break; if (file->f_flags & O_NONBLOCK) { diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index eb9133b472f4..63114ea35ec1 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -586,6 +586,14 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session) return; } + /* + * Some console devices aren't actually hung up for technical and + * historical reasons, which can lead to indefinite interruptible + * sleep in n_tty_read(). The following explicitly tells + * n_tty_read() to abort readers. + */ + set_bit(TTY_HUPPING, &tty->flags); + /* inuse_filps is protected by the single tty lock, this really needs to change if we want to flush the workqueue with the lock held */ @@ -640,6 +648,7 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session) * from the ldisc side, which is now guaranteed. */ set_bit(TTY_HUPPED, &tty->flags); + clear_bit(TTY_HUPPING, &tty->flags); tty_unlock(tty); if (f) diff --git a/include/linux/tty.h b/include/linux/tty.h index 0a6c71e0ad01..47f8af22f216 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -364,6 +364,7 @@ struct tty_file_private { #define TTY_PTY_LOCK 16 /* pty private */ #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ #define TTY_HUPPED 18 /* Post driver->hangup() */ +#define TTY_HUPPING 19 /* Hangup in progress */ #define TTY_LDISC_HALTED 22 /* Line discipline is halted */ /* Values for tty->flow_change */ -- cgit v1.2.3 From 9c0fb1e313aaf4e8edec22433c8b22dd308e466c Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Tue, 27 Feb 2018 20:10:18 +0800 Subject: block: display the correct diskname for bio bio_devname use __bdevname to display the device name, and can only show the major and minor of the part0, Fix this by using disk_name to display the correct name. Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index") Reviewed-by: Omar Sandoval Reviewed-by: Christoph Hellwig Signed-off-by: Jiufei Xue Signed-off-by: Jens Axboe --- block/partition-generic.c | 6 ++++++ include/linux/bio.h | 4 +--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/partition-generic.c b/block/partition-generic.c index 91622db9aedf..08dabcd8b6ae 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -51,6 +51,12 @@ const char *bdevname(struct block_device *bdev, char *buf) EXPORT_SYMBOL(bdevname); +const char *bio_devname(struct bio *bio, char *buf) +{ + return disk_name(bio->bi_disk, bio->bi_partno, buf); +} +EXPORT_SYMBOL(bio_devname); + /* * There's very little reason to use this, you should really * have a struct block_device just about everywhere and use diff --git a/include/linux/bio.h b/include/linux/bio.h index d0eb659fa733..ce547a25e8ae 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -511,6 +511,7 @@ void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); +extern const char *bio_devname(struct bio *bio, char *buffer); #define bio_set_dev(bio, bdev) \ do { \ @@ -529,9 +530,6 @@ do { \ #define bio_dev(bio) \ disk_devt((bio)->bi_disk) -#define bio_devname(bio, buf) \ - __bdevname(bio_dev(bio), (buf)) - #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); void bio_disassociate_task(struct bio *bio); -- cgit v1.2.3 From fde9fc766e96c494b82931b1d270a9a751be07c0 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Mon, 19 Feb 2018 16:55:06 +0000 Subject: signals: Move put_compat_sigset to compat.h to silence hardened usercopy Since commit afcc90f8621e ("usercopy: WARN() on slab cache usercopy region violations"), MIPS systems booting with a compat root filesystem emit a warning when copying compat siginfo to userspace: WARNING: CPU: 0 PID: 953 at mm/usercopy.c:81 usercopy_warn+0x98/0xe8 Bad or missing usercopy whitelist? Kernel memory exposure attempt detected from SLAB object 'task_struct' (offset 1432, size 16)! Modules linked in: CPU: 0 PID: 953 Comm: S01logging Not tainted 4.16.0-rc2 #10 Stack : ffffffff808c0000 0000000000000000 0000000000000001 65ac85163f3bdc4a 65ac85163f3bdc4a 0000000000000000 90000000ff667ab8 ffffffff808c0000 00000000000003f8 ffffffff808d0000 00000000000000d1 0000000000000000 000000000000003c 0000000000000000 ffffffff808c8ca8 ffffffff808d0000 ffffffff808d0000 ffffffff80810000 fffffc0000000000 ffffffff80785c30 0000000000000009 0000000000000051 90000000ff667eb0 90000000ff667db0 000000007fe0d938 0000000000000018 ffffffff80449958 0000000020052798 ffffffff808c0000 90000000ff664000 90000000ff667ab0 00000000100c0000 ffffffff80698810 0000000000000000 0000000000000000 0000000000000000 0000000000000000 0000000000000000 ffffffff8010d02c 65ac85163f3bdc4a ... Call Trace: [] show_stack+0x9c/0x130 [] dump_stack+0x90/0xd0 [] __warn+0x100/0x118 [] warn_slowpath_fmt+0x4c/0x70 [] usercopy_warn+0x98/0xe8 [] __check_object_size+0xfc/0x250 [] put_compat_sigset+0x30/0x88 [] setup_rt_frame_n32+0xc4/0x160 [] do_signal+0x19c/0x230 [] do_notify_resume+0x60/0x78 [] work_notifysig+0x10/0x18 ---[ end trace 88fffbf69147f48a ]--- Commit 5905429ad856 ("fork: Provide usercopy whitelisting for task_struct") noted that: "While the blocked and saved_sigmask fields of task_struct are copied to userspace (via sigmask_to_save() and setup_rt_frame()), it is always copied with a static length (i.e. sizeof(sigset_t))." However, this is not true in the case of compat signals, whose sigset is copied by put_compat_sigset and receives size as an argument. At most call sites, put_compat_sigset is copying a sigset from the current task_struct. This triggers a warning when CONFIG_HARDENED_USERCOPY is active. However, by marking this function as static inline, the warning can be avoided because in all of these cases the size is constant at compile time, which is allowed. The only site where this is not the case is handling the rt_sigpending syscall, but there the copy is being made from a stack local variable so does not trigger the warning. Move put_compat_sigset to compat.h, and mark it static inline. This fixes the WARN on MIPS. Fixes: afcc90f8621e ("usercopy: WARN() on slab cache usercopy region violations") Signed-off-by: Matt Redfearn Acked-by: Kees Cook Cc: "Dmitry V . Levin" Cc: Al Viro Cc: kernel-hardening@lists.openwall.com Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/18639/ Signed-off-by: James Hogan --- include/linux/compat.h | 26 ++++++++++++++++++++++++-- kernel/compat.c | 19 ------------------- 2 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 8a9643857c4a..c4139c7a0de0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -17,6 +17,7 @@ #include #include #include /* for aio_context_t */ +#include #include #include @@ -550,8 +551,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp); extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat); -extern int put_compat_sigset(compat_sigset_t __user *compat, - const sigset_t *set, unsigned int size); + +/* + * Defined inline such that size can be compile time constant, which avoids + * CONFIG_HARDENED_USERCOPY complaining about copies from task_struct + */ +static inline int +put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, + unsigned int size) +{ + /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ +#ifdef __BIG_ENDIAN + compat_sigset_t v; + switch (_NSIG_WORDS) { + case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; + case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; + case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; + case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; + } + return copy_to_user(compat, &v, size) ? -EFAULT : 0; +#else + return copy_to_user(compat, set, size) ? -EFAULT : 0; +#endif +} asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes, diff --git a/kernel/compat.c b/kernel/compat.c index 3247fe761f60..3f5fa8902e7d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -488,25 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) } EXPORT_SYMBOL_GPL(get_compat_sigset); -int -put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, - unsigned int size) -{ - /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ -#ifdef __BIG_ENDIAN - compat_sigset_t v; - switch (_NSIG_WORDS) { - case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; - case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; - case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; - case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; - } - return copy_to_user(compat, &v, size) ? -EFAULT : 0; -#else - return copy_to_user(compat, set, size) ? -EFAULT : 0; -#endif -} - #ifdef CONFIG_NUMA COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, compat_uptr_t __user *, pages32, -- cgit v1.2.3 From 779b7931b27bfa80bac46d0115d229259aef580b Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Thu, 1 Mar 2018 17:13:37 +1100 Subject: net: rename skb_gso_validate_mtu -> skb_gso_validate_network_len If you take a GSO skb, and split it into packets, will the network length (L3 headers + L4 headers + payload) of those packets be small enough to fit within a given MTU? skb_gso_validate_mtu gives you the answer to that question. However, we recently added to add a way to validate the MAC length of a split GSO skb (L2+L3+L4+payload), and the names get confusing, so rename skb_gso_validate_mtu to skb_gso_validate_network_len Signed-off-by: Daniel Axtens Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 11 ++++++----- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv4/netfilter/nf_flow_table_ipv4.c | 2 +- net/ipv6/ip6_output.c | 2 +- net/ipv6/netfilter/nf_flow_table_ipv6.c | 2 +- net/mpls/af_mpls.c | 2 +- net/xfrm/xfrm_device.c | 2 +- 9 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c1e66bdcf583..a057dd1a75c7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3286,7 +3286,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); unsigned int skb_gso_transport_seglen(const struct sk_buff *skb); -bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu); +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 09bd89c90a71..b63767008824 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4955,19 +4955,20 @@ static inline bool skb_gso_size_check(const struct sk_buff *skb, } /** - * skb_gso_validate_mtu - Return in case such skb fits a given MTU + * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? * * @skb: GSO skb * @mtu: MTU to validate against * - * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU - * once split. + * skb_gso_validate_network_len validates if a given skb will fit a + * wanted MTU once split. It considers L3 headers, L4 headers, and the + * payload. */ -bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu) +bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) { return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); } -EXPORT_SYMBOL_GPL(skb_gso_validate_mtu); +EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); /** * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 2dd21c3281a1..b54b948b0596 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -55,7 +55,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) if (skb->ignore_df) return false; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e8e675be60ec..66340ab750e6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -248,7 +248,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, /* common case: seglen is <= mtu */ - if (skb_gso_validate_mtu(skb, mtu)) + if (skb_gso_validate_network_len(skb, mtu)) return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length exceeds the egress MTU. diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c index 282b9cc4fe82..0cd46bffa469 100644 --- a/net/ipv4/netfilter/nf_flow_table_ipv4.c +++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c @@ -186,7 +186,7 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) return false; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 997c7f19ad62..a8a919520090 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -412,7 +412,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->ignore_df) return false; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c index d346705d6ee6..207cb35569b1 100644 --- a/net/ipv6/netfilter/nf_flow_table_ipv6.c +++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c @@ -178,7 +178,7 @@ static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) if (skb->len <= mtu) return false; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index e545a3c9365f..7a4de6d618b1 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -122,7 +122,7 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->len <= mtu) return false; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 8e70291e586a..e87d6c4dd5b6 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -217,7 +217,7 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) if (skb->len <= mtu) goto ok; - if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) goto ok; } -- cgit v1.2.3 From a4a77718ee4053a44aa40fe67247c1afb5ce2f1e Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Thu, 1 Mar 2018 17:13:40 +1100 Subject: net: make skb_gso_*_seglen functions private They're very hard to use properly as they do not consider the GSO_BY_FRAGS case. Code should use skb_gso_validate_network_len and skb_gso_validate_mac_len as they do consider this case. Make the seglen functions static, which stops people using them outside of skbuff.c Signed-off-by: Daniel Axtens Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/linux/skbuff.h | 33 --------------------------------- net/core/skbuff.c | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a057dd1a75c7..ddf77cf4ff2d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3285,7 +3285,6 @@ int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); -unsigned int skb_gso_transport_seglen(const struct sk_buff *skb); bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); @@ -4104,38 +4103,6 @@ static inline bool skb_head_is_locked(const struct sk_buff *skb) return !skb->head_frag || skb_cloned(skb); } -/** - * skb_gso_network_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_network_seglen is used to determine the real size of the - * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). - * - * The MAC/L2 header is not accounted for. - */ -static inline unsigned int skb_gso_network_seglen(const struct sk_buff *skb) -{ - unsigned int hdr_len = skb_transport_header(skb) - - skb_network_header(skb); - return hdr_len + skb_gso_transport_seglen(skb); -} - -/** - * skb_gso_mac_seglen - Return length of individual segments of a gso packet - * - * @skb: GSO skb - * - * skb_gso_mac_seglen is used to determine the real size of the - * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 - * headers (TCP/UDP). - */ -static inline unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) -{ - unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); - return hdr_len + skb_gso_transport_seglen(skb); -} - /* Local Checksum Offload. * Compute outer checksum based on the assumption that the * inner checksum will be offloaded later. diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b63767008824..0bb0d8877954 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4891,7 +4891,7 @@ EXPORT_SYMBOL_GPL(skb_scrub_packet); * * The MAC/L2 or network (IP, IPv6) headers are not accounted for. */ -unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) +static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int thlen = 0; @@ -4913,7 +4913,40 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) */ return thlen + shinfo->gso_size; } -EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); + +/** + * skb_gso_network_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_network_seglen is used to determine the real size of the + * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). + * + * The MAC/L2 header is not accounted for. + */ +static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - + skb_network_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} + +/** + * skb_gso_mac_seglen - Return length of individual segments of a gso packet + * + * @skb: GSO skb + * + * skb_gso_mac_seglen is used to determine the real size of the + * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 + * headers (TCP/UDP). + */ +static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) +{ + unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); + + return hdr_len + skb_gso_transport_seglen(skb); +} /** * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS -- cgit v1.2.3 From a6f1086e29e93621a6394b94b8c0e4a4e490f38b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Mon, 5 Mar 2018 15:22:30 -0800 Subject: PCI: Move of_irq_parse_and_map_pci() declaration under OF_IRQ Since commit 4670d610d592 ("PCI: Move OF-related PCI functions into PCI core"), sparc:allmodconfig fails to build with the following error. pcie-cadence-host.c:(.text+0x4c4): undefined reference to `of_irq_parse_and_map_pci' pcie-cadence-host.c:(.text+0x4c8): undefined reference to `of_irq_parse_and_map_pci' of_irq_parse_and_map_pci() is now only available if OF_IRQ is enabled. Make its declaration and its dummy function dependent on OF_IRQ to solve the problem. Fixes: 4670d610d592 ("PCI: Move OF-related PCI functions into PCI core") Signed-off-by: Guenter Roeck Signed-off-by: Bjorn Helgaas Acked-by: Rob Herring --- include/linux/of_pci.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h index 88865e0ebf4d..091033a6b836 100644 --- a/include/linux/of_pci.h +++ b/include/linux/of_pci.h @@ -13,7 +13,6 @@ struct device_node; struct device_node *of_pci_find_child_device(struct device_node *parent, unsigned int devfn); int of_pci_get_devfn(struct device_node *np); -int of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin); int of_pci_parse_bus_range(struct device_node *node, struct resource *res); int of_get_pci_domain_nr(struct device_node *node); int of_pci_get_max_link_speed(struct device_node *node); @@ -33,12 +32,6 @@ static inline int of_pci_get_devfn(struct device_node *np) return -EINVAL; } -static inline int -of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin) -{ - return 0; -} - static inline int of_pci_parse_bus_range(struct device_node *node, struct resource *res) { @@ -67,6 +60,16 @@ of_pci_get_max_link_speed(struct device_node *node) static inline void of_pci_check_probe_only(void) { } #endif +#if IS_ENABLED(CONFIG_OF_IRQ) +int of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin); +#else +static inline int +of_irq_parse_and_map_pci(const struct pci_dev *dev, u8 slot, u8 pin) +{ + return 0; +} +#endif + #if defined(CONFIG_OF_ADDRESS) int of_pci_get_host_bridge_resources(struct device_node *dev, unsigned char busno, unsigned char bus_max, -- cgit v1.2.3 From 859d880cf544dbe095ce97534ef04cd88ba2f2b4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 6 Mar 2018 00:20:25 -0600 Subject: signal: Correct the offset of si_pkey in struct siginfo The change moving addr_lsb into the _sigfault union failed to take into account that _sigfault._addr_bnd._lower being a pointer forced the entire union to have pointer alignment. In practice this only mattered for the offset of si_pkey which is why this has taken so long to discover. To correct this change _dummy_pkey and _dummy_bnd to have pointer type. Reported-by: kernel test robot Fixes: b68a68d3dcc1 ("signal: Move addr_lsb into the _sigfault union for clarity") Signed-off-by: "Eric W. Biederman" --- include/linux/compat.h | 4 ++-- include/uapi/asm-generic/siginfo.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 8a9643857c4a..e16d07eb08cf 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -229,13 +229,13 @@ typedef struct compat_siginfo { short int _addr_lsb; /* Valid LSB of the reported address. */ /* used when si_code=SEGV_BNDERR */ struct { - short _dummy_bnd; + compat_uptr_t _dummy_bnd; compat_uptr_t _lower; compat_uptr_t _upper; } _addr_bnd; /* used when si_code=SEGV_PKUERR */ struct { - short _dummy_pkey; + compat_uptr_t _dummy_pkey; u32 _pkey; } _addr_pkey; }; diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 85dc965afd89..99c902e460c2 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -102,13 +102,13 @@ typedef struct siginfo { short _addr_lsb; /* LSB of the reported address */ /* used when si_code=SEGV_BNDERR */ struct { - short _dummy_bnd; + void *_dummy_bnd; void __user *_lower; void __user *_upper; } _addr_bnd; /* used when si_code=SEGV_PKUERR */ struct { - short _dummy_pkey; + void *_dummy_pkey; __u32 _pkey; } _addr_pkey; }; -- cgit v1.2.3 From cb88a0588717ba6c756cb5972d75766b273a6817 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 6 Mar 2018 09:38:49 +0100 Subject: usb: quirks: add control message delay for 1b1c:1b20 Corsair Strafe RGB keyboard does not respond to usb control messages sometimes and hence generates timeouts. Commit de3af5bf259d ("usb: quirks: add delay init quirk for Corsair Strafe RGB keyboard") tried to fix those timeouts by adding USB_QUIRK_DELAY_INIT. Unfortunately, even with this quirk timeouts of usb_control_msg() can still be seen, but with a lower frequency (approx. 1 out of 15): [ 29.103520] usb 1-8: string descriptor 0 read error: -110 [ 34.363097] usb 1-8: can't set config #1, error -110 Adding further delays to different locations where usb control messages are issued just moves the timeouts to other locations, e.g.: [ 35.400533] usbhid 1-8:1.0: can't add hid device: -110 [ 35.401014] usbhid: probe of 1-8:1.0 failed with error -110 The only way to reliably avoid those issues is having a pause after each usb control message. In approx. 200 boot cycles no more timeouts were seen. Addionaly, keep USB_QUIRK_DELAY_INIT as it turned out to be necessary to have the delay in hub_port_connect() after hub_port_init(). The overall boot time seems not to be influenced by these additional delays, even on fast machines and lightweight distributions. Fixes: de3af5bf259d ("usb: quirks: add delay init quirk for Corsair Strafe RGB keyboard") Cc: stable@vger.kernel.org Signed-off-by: Danilo Krummrich Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 4 ++++ drivers/usb/core/quirks.c | 3 ++- include/linux/usb/quirks.h | 3 +++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index c64cf6c4a83d..0c11d40a12bc 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -151,6 +151,10 @@ int usb_control_msg(struct usb_device *dev, unsigned int pipe, __u8 request, ret = usb_internal_control_msg(dev, pipe, dr, data, size, timeout); + /* Linger a bit, prior to the next control message. */ + if (dev->quirks & USB_QUIRK_DELAY_CTRL_MSG) + msleep(200); + kfree(dr); return ret; diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index f4a548471f0f..54b019e267c5 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -230,7 +230,8 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x1b1c, 0x1b13), .driver_info = USB_QUIRK_DELAY_INIT }, /* Corsair Strafe RGB */ - { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT }, + { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT | + USB_QUIRK_DELAY_CTRL_MSG }, /* Corsair K70 LUX */ { USB_DEVICE(0x1b1c, 0x1b36), .driver_info = USB_QUIRK_DELAY_INIT }, diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index f1fcec2fd5f8..b7a99ce56bc9 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -63,4 +63,7 @@ */ #define USB_QUIRK_DISCONNECT_SUSPEND BIT(12) +/* Device needs a pause after every control message. */ +#define USB_QUIRK_DELAY_CTRL_MSG BIT(13) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From 16ca6a607d84bef0129698d8d808f501afd08d43 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Mar 2018 21:48:01 +0000 Subject: KVM: arm/arm64: vgic: Don't populate multiple LRs with the same vintid The vgic code is trying to be clever when injecting GICv2 SGIs, and will happily populate LRs with the same interrupt number if they come from multiple vcpus (after all, they are distinct interrupt sources). Unfortunately, this is against the letter of the architecture, and the GICv2 architecture spec says "Each valid interrupt stored in the List registers must have a unique VirtualID for that virtual CPU interface.". GICv3 has similar (although slightly ambiguous) restrictions. This results in guests locking up when using GICv2-on-GICv3, for example. The obvious fix is to stop trying so hard, and inject a single vcpu per SGI per guest entry. After all, pending SGIs with multiple source vcpus are pretty rare, and are mostly seen in scenario where the physical CPUs are severely overcomitted. But as we now only inject a single instance of a multi-source SGI per vcpu entry, we may delay those interrupts for longer than strictly necessary, and run the risk of injecting lower priority interrupts in the meantime. In order to address this, we adopt a three stage strategy: - If we encounter a multi-source SGI in the AP list while computing its depth, we force the list to be sorted - When populating the LRs, we prevent the injection of any interrupt of lower priority than that of the first multi-source SGI we've injected. - Finally, the injection of a multi-source SGI triggers the request of a maintenance interrupt when there will be no pending interrupt in the LRs (HCR_NPIE). At the point where the last pending interrupt in the LRs switches from Pending to Active, the maintenance interrupt will be delivered, allowing us to add the remaining SGIs using the same process. Cc: stable@vger.kernel.org Fixes: 0919e84c0fc1 ("KVM: arm/arm64: vgic-new: Add IRQ sync/flush framework") Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/linux/irqchip/arm-gic-v3.h | 1 + include/linux/irqchip/arm-gic.h | 1 + virt/kvm/arm/vgic/vgic-v2.c | 9 +++++- virt/kvm/arm/vgic/vgic-v3.c | 9 +++++- virt/kvm/arm/vgic/vgic.c | 61 +++++++++++++++++++++++++++++--------- virt/kvm/arm/vgic/vgic.h | 2 ++ 6 files changed, 67 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index c00c4c33e432..b26eccc78fb1 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -503,6 +503,7 @@ #define ICH_HCR_EN (1 << 0) #define ICH_HCR_UIE (1 << 1) +#define ICH_HCR_NPIE (1 << 3) #define ICH_HCR_TC (1 << 10) #define ICH_HCR_TALL0 (1 << 11) #define ICH_HCR_TALL1 (1 << 12) diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index d3453ee072fc..68d8b1f73682 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -84,6 +84,7 @@ #define GICH_HCR_EN (1 << 0) #define GICH_HCR_UIE (1 << 1) +#define GICH_HCR_NPIE (1 << 3) #define GICH_LR_VIRTUALID (0x3ff << 0) #define GICH_LR_PHYSID_CPUID_SHIFT (10) diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index e9d840a75e7b..29556f71b691 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -37,6 +37,13 @@ void vgic_v2_init_lrs(void) vgic_v2_write_lr(i, 0); } +void vgic_v2_set_npie(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + + cpuif->vgic_hcr |= GICH_HCR_NPIE; +} + void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; @@ -64,7 +71,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) int lr; unsigned long flags; - cpuif->vgic_hcr &= ~GICH_HCR_UIE; + cpuif->vgic_hcr &= ~(GICH_HCR_UIE | GICH_HCR_NPIE); for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { u32 val = cpuif->vgic_lr[lr]; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 6b329414e57a..0ff2006f3781 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -26,6 +26,13 @@ static bool group1_trap; static bool common_trap; static bool gicv4_enable; +void vgic_v3_set_npie(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + + cpuif->vgic_hcr |= ICH_HCR_NPIE; +} + void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; @@ -47,7 +54,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) int lr; unsigned long flags; - cpuif->vgic_hcr &= ~ICH_HCR_UIE; + cpuif->vgic_hcr &= ~(ICH_HCR_UIE | ICH_HCR_NPIE); for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 0001858a2c23..8201899126f6 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -710,22 +710,37 @@ static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) vgic_v3_set_underflow(vcpu); } +static inline void vgic_set_npie(struct kvm_vcpu *vcpu) +{ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_set_npie(vcpu); + else + vgic_v3_set_npie(vcpu); +} + /* Requires the ap_list_lock to be held. */ -static int compute_ap_list_depth(struct kvm_vcpu *vcpu) +static int compute_ap_list_depth(struct kvm_vcpu *vcpu, + bool *multi_sgi) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq; int count = 0; + *multi_sgi = false; + DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); /* GICv2 SGIs can count for more than one... */ - if (vgic_irq_is_sgi(irq->intid) && irq->source) - count += hweight8(irq->source); - else + if (vgic_irq_is_sgi(irq->intid) && irq->source) { + int w = hweight8(irq->source); + + count += w; + *multi_sgi |= (w > 1); + } else { count++; + } spin_unlock(&irq->irq_lock); } return count; @@ -736,28 +751,43 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq; - int count = 0; + int count; + bool npie = false; + bool multi_sgi; + u8 prio = 0xff; DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); - if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr) + count = compute_ap_list_depth(vcpu, &multi_sgi); + if (count > kvm_vgic_global_state.nr_lr || multi_sgi) vgic_sort_ap_list(vcpu); + count = 0; + list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); - if (unlikely(vgic_target_oracle(irq) != vcpu)) - goto next; - /* - * If we get an SGI with multiple sources, try to get - * them in all at once. + * If we have multi-SGIs in the pipeline, we need to + * guarantee that they are all seen before any IRQ of + * lower priority. In that case, we need to filter out + * these interrupts by exiting early. This is easy as + * the AP list has been sorted already. */ - do { + if (multi_sgi && irq->priority > prio) { + spin_unlock(&irq->irq_lock); + break; + } + + if (likely(vgic_target_oracle(irq) == vcpu)) { vgic_populate_lr(vcpu, irq, count++); - } while (irq->source && count < kvm_vgic_global_state.nr_lr); -next: + if (irq->source) { + npie = true; + prio = irq->priority; + } + } + spin_unlock(&irq->irq_lock); if (count == kvm_vgic_global_state.nr_lr) { @@ -768,6 +798,9 @@ next: } } + if (npie) + vgic_set_npie(vcpu); + vcpu->arch.vgic_cpu.used_lrs = count; /* Nuke remaining LRs */ diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 5b11859a1a1e..f5b8519e5546 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -160,6 +160,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v2_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v2_set_npie(struct kvm_vcpu *vcpu); int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr); int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); @@ -189,6 +190,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); void vgic_v3_set_underflow(struct kvm_vcpu *vcpu); +void vgic_v3_set_npie(struct kvm_vcpu *vcpu); void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_v3_enable(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 95dd77580ccd66a0da96e6d4696945b8cea39431 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 14 Mar 2018 18:20:29 -0500 Subject: fs: Teach path_connected to handle nfs filesystems with multiple roots. On nfsv2 and nfsv3 the nfs server can export subsets of the same filesystem and report the same filesystem identifier, so that the nfs client can know they are the same filesystem. The subsets can be from disjoint directory trees. The nfsv2 and nfsv3 filesystems provides no way to find the common root of all directory trees exported form the server with the same filesystem identifier. The practical result is that in struct super s_root for nfs s_root is not necessarily the root of the filesystem. The nfs mount code sets s_root to the root of the first subset of the nfs filesystem that the kernel mounts. This effects the dcache invalidation code in generic_shutdown_super currently called shrunk_dcache_for_umount and that code for years has gone through an additional list of dentries that might be dentry trees that need to be freed to accomodate nfs. When I wrote path_connected I did not realize nfs was so special, and it's hueristic for avoiding calling is_subdir can fail. The practical case where this fails is when there is a move of a directory from the subtree exposed by one nfs mount to the subtree exposed by another nfs mount. This move can happen either locally or remotely. With the remote case requiring that the move directory be cached before the move and that after the move someone walks the path to where the move directory now exists and in so doing causes the already cached directory to be moved in the dcache through the magic of d_splice_alias. If someone whose working directory is in the move directory or a subdirectory and now starts calling .. from the initial mount of nfs (where s_root == mnt_root), then path_connected as a heuristic will not bother with the is_subdir check. As s_root really is not the root of the nfs filesystem this heuristic is wrong, and the path may actually not be connected and path_connected can fail. The is_subdir function might be cheap enough that we can call it unconditionally. Verifying that will take some benchmarking and the result may not be the same on all kernels this fix needs to be backported to. So I am avoiding that for now. Filesystems with snapshots such as nilfs and btrfs do something similar. But as the directory tree of the snapshots are disjoint from one another and from the main directory tree rename won't move things between them and this problem will not occur. Cc: stable@vger.kernel.org Reported-by: Al Viro Fixes: 397d425dc26d ("vfs: Test for and handle paths that are unreachable from their mnt_root") Signed-off-by: "Eric W. Biederman" Signed-off-by: Al Viro --- fs/namei.c | 5 +++-- fs/nfs/super.c | 2 ++ include/linux/fs.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 921ae32dbc80..cafa365eeb70 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -559,9 +559,10 @@ static int __nd_alloc_stack(struct nameidata *nd) static bool path_connected(const struct path *path) { struct vfsmount *mnt = path->mnt; + struct super_block *sb = mnt->mnt_sb; - /* Only bind mounts can have disconnected paths */ - if (mnt->mnt_root == mnt->mnt_sb->s_root) + /* Bind mounts and multi-root filesystems can have disconnected paths */ + if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) return true; return is_subdir(path->dentry, mnt->mnt_root); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 29bacdc56f6a..5e470e233c83 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2631,6 +2631,8 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, /* initial superblock/root creation */ mount_info->fill_super(s, mount_info); nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); + if (!(server->flags & NFS_MOUNT_UNSHARED)) + s->s_iflags |= SB_I_MULTIROOT; } mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); diff --git a/include/linux/fs.h b/include/linux/fs.h index 2a815560fda0..0430e03febaa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1317,6 +1317,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ +#define SB_I_MULTIROOT 0x00000008 /* Multiple roots to the dentry tree */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ -- cgit v1.2.3