From f0e6d220a7cd93afa0260ac5e7849f00b05e035a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Aug 2013 16:07:13 +0100 Subject: KEYS: Load *.x509 files into kernel keyring Load all the files matching the pattern "*.x509" that are to be found in kernel base source dir and base build dir into the module signing keyring. The "extra_certificates" file is then redundant. Signed-off-by: David Howells --- kernel/Makefile | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 1ce47553fb02..c34e5f993a21 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -142,17 +142,40 @@ $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE $(call if_changed,bc) ifeq ($(CONFIG_MODULE_SIG),y) +############################################################################### # -# Pull the signing certificate and any extra certificates into the kernel +# Roll all the X.509 certificates that we can find together and pull +# them into the kernel. # +############################################################################### +X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) +X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 +X509_CERTIFICATES := $(sort $(X509_CERTIFICATES-y)) + +ifeq ($(X509_CERTIFICATES),) +$(warning *** No X.509 certificates found ***) +endif + +ifneq ($(wildcard $(obj)/.x509.list),) +ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) +$(info X.509 certificate list changed) +$(shell rm $(obj)/.x509.list) +endif +endif + +kernel/modsign_certificate.o: $(obj)/x509_certificate_list -quiet_cmd_touch = TOUCH $@ - cmd_touch = touch $@ +quiet_cmd_x509certs = CERTS $@ + cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ +targets += $(obj)/x509_certificate_list +$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list + $(call if_changed,x509certs) -extra_certificates: - $(call cmd,touch) +targets += $(obj)/.x509.list +$(obj)/.x509.list: + @echo $(X509_CERTIFICATES) >$@ -kernel/modsign_certificate.o: signing_key.x509 extra_certificates +clean-files := x509_certificate_list .x509.list ############################################################################### # -- cgit v1.3 From 0fbd39cf7ffe3b6a787b66b672d21b84e4675352 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Aug 2013 17:13:15 +0100 Subject: KEYS: Have make canonicalise the paths of the X.509 certs better to deduplicate Have make canonicalise the paths of the X.509 certificates before we sort them as this allows $(sort) to better remove duplicates. Signed-off-by: David Howells --- kernel/Makefile | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index c34e5f993a21..2c24195249d5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -144,13 +144,19 @@ $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE ifeq ($(CONFIG_MODULE_SIG),y) ############################################################################### # -# Roll all the X.509 certificates that we can find together and pull -# them into the kernel. +# Roll all the X.509 certificates that we can find together and pull them into +# the kernel. +# +# We look in the source root and the build root for all files whose name ends +# in ".x509". Unfortunately, this will generate duplicate filenames, so we +# have make canonicalise the pathnames and then sort them to discard the +# duplicates. # ############################################################################### X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 -X509_CERTIFICATES := $(sort $(X509_CERTIFICATES-y)) +X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ + $(or $(realpath $(CERT)),$(CERT)))) ifeq ($(X509_CERTIFICATES),) $(warning *** No X.509 certificates found ***) -- cgit v1.3 From b56e5a17b6b9acd16997960504b9940d0d7984e7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Aug 2013 16:07:30 +0100 Subject: KEYS: Separate the kernel signature checking keyring from module signing Separate the kernel signature checking keyring from module signing so that it can be used by code other than the module-signing code. Signed-off-by: David Howells --- include/keys/system_keyring.h | 23 ++++++++++ init/Kconfig | 13 ++++++ kernel/Makefile | 15 ++++-- kernel/modsign_certificate.S | 11 ----- kernel/modsign_pubkey.c | 104 ------------------------------------------ kernel/module-internal.h | 2 - kernel/module_signing.c | 3 +- kernel/system_certificates.S | 12 +++++ kernel/system_keyring.c | 103 +++++++++++++++++++++++++++++++++++++++++ 9 files changed, 163 insertions(+), 123 deletions(-) create mode 100644 include/keys/system_keyring.h delete mode 100644 kernel/modsign_certificate.S delete mode 100644 kernel/modsign_pubkey.c create mode 100644 kernel/system_certificates.S create mode 100644 kernel/system_keyring.c (limited to 'kernel/Makefile') diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h new file mode 100644 index 000000000000..8dabc399bd1d --- /dev/null +++ b/include/keys/system_keyring.h @@ -0,0 +1,23 @@ +/* System keyring containing trusted public keys. + * + * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _KEYS_SYSTEM_KEYRING_H +#define _KEYS_SYSTEM_KEYRING_H + +#ifdef CONFIG_SYSTEM_TRUSTED_KEYRING + +#include + +extern struct key *system_trusted_keyring; + +#endif + +#endif /* _KEYS_SYSTEM_KEYRING_H */ diff --git a/init/Kconfig b/init/Kconfig index 3ecd8a1178f1..0ff5407a8378 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1668,6 +1668,18 @@ config BASE_SMALL default 0 if BASE_FULL default 1 if !BASE_FULL +config SYSTEM_TRUSTED_KEYRING + bool "Provide system-wide ring of trusted keys" + depends on KEYS + help + Provide a system keyring to which trusted keys can be added. Keys in + the keyring are considered to be trusted. Keys may be added at will + by the kernel from compiled-in data and from hardware key stores, but + userspace may only add extra keys if those keys can be verified by + keys already in the keyring. + + Keys in this keyring are used by module signature checking. + menuconfig MODULES bool "Enable loadable module support" option modules @@ -1741,6 +1753,7 @@ config MODULE_SRCVERSION_ALL config MODULE_SIG bool "Module signature verification" depends on MODULES + select SYSTEM_TRUSTED_KEYRING select KEYS select CRYPTO select ASYMMETRIC_KEY_TYPE diff --git a/kernel/Makefile b/kernel/Makefile index 2c24195249d5..63136989c132 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -54,8 +54,9 @@ obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o +obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -141,11 +142,11 @@ targets += timeconst.h $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE $(call if_changed,bc) -ifeq ($(CONFIG_MODULE_SIG),y) ############################################################################### # # Roll all the X.509 certificates that we can find together and pull them into -# the kernel. +# the kernel so that they get loaded into the system trusted keyring during +# boot. # # We look in the source root and the build root for all files whose name ends # in ".x509". Unfortunately, this will generate duplicate filenames, so we @@ -153,6 +154,7 @@ ifeq ($(CONFIG_MODULE_SIG),y) # duplicates. # ############################################################################### +ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ @@ -169,10 +171,11 @@ $(shell rm $(obj)/.x509.list) endif endif -kernel/modsign_certificate.o: $(obj)/x509_certificate_list +kernel/system_certificates.o: $(obj)/x509_certificate_list quiet_cmd_x509certs = CERTS $@ - cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ + cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") + targets += $(obj)/x509_certificate_list $(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list $(call if_changed,x509certs) @@ -182,7 +185,9 @@ $(obj)/.x509.list: @echo $(X509_CERTIFICATES) >$@ clean-files := x509_certificate_list .x509.list +endif +ifeq ($(CONFIG_MODULE_SIG),y) ############################################################################### # # If module signing is requested, say by allyesconfig, but a key has not been diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S deleted file mode 100644 index 6fe03c7ffe72..000000000000 --- a/kernel/modsign_certificate.S +++ /dev/null @@ -1,11 +0,0 @@ -#include - -#define GLOBAL(name) \ - .globl VMLINUX_SYMBOL(name); \ - VMLINUX_SYMBOL(name): - - .section ".init.data","aw" - -GLOBAL(modsign_certificate_list) - .incbin "kernel/x509_certificate_list" -GLOBAL(modsign_certificate_list_end) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c deleted file mode 100644 index 7cbd4507a7e6..000000000000 --- a/kernel/modsign_pubkey.c +++ /dev/null @@ -1,104 +0,0 @@ -/* Public keys for module signature verification - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include "module-internal.h" - -struct key *modsign_keyring; - -extern __initconst const u8 modsign_certificate_list[]; -extern __initconst const u8 modsign_certificate_list_end[]; - -/* - * We need to make sure ccache doesn't cache the .o file as it doesn't notice - * if modsign.pub changes. - */ -static __initconst const char annoy_ccache[] = __TIME__ "foo"; - -/* - * Load the compiled-in keys - */ -static __init int module_verify_init(void) -{ - pr_notice("Initialise module verification\n"); - - modsign_keyring = keyring_alloc(".module_sign", - KUIDT_INIT(0), KGIDT_INIT(0), - current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(modsign_keyring)) - panic("Can't allocate module signing keyring\n"); - - return 0; -} - -/* - * Must be initialised before we try and load the keys into the keyring. - */ -device_initcall(module_verify_init); - -/* - * Load the compiled-in keys - */ -static __init int load_module_signing_keys(void) -{ - key_ref_t key; - const u8 *p, *end; - size_t plen; - - pr_notice("Loading module verification certificates\n"); - - end = modsign_certificate_list_end; - p = modsign_certificate_list; - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(modsign_keyring, 1), - "asymmetric", - NULL, - p, - plen, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA); - if (IS_ERR(key)) - pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - else - pr_notice("MODSIGN: Loaded cert '%s'\n", - key_ref_to_ptr(key)->description); - p += plen; - } - - return 0; - -dodgy_cert: - pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); - return 0; -} -late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 24f9247b7d02..915e123a430f 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -9,6 +9,4 @@ * 2 of the Licence, or (at your option) any later version. */ -extern struct key *modsign_keyring; - extern int mod_verify_sig(const void *mod, unsigned long *_modlen); diff --git a/kernel/module_signing.c b/kernel/module_signing.c index ee476404167b..0b6b870dc5e4 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "module-internal.h" /* @@ -157,7 +158,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len, pr_debug("Look up: \"%s\"\n", id); - key = keyring_search(make_key_ref(modsign_keyring, 1), + key = keyring_search(make_key_ref(system_trusted_keyring, 1), &key_type_asymmetric, id); if (IS_ERR(key)) pr_warn("Request for unknown module key '%s' err %ld\n", diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S new file mode 100644 index 000000000000..552d47b2d463 --- /dev/null +++ b/kernel/system_certificates.S @@ -0,0 +1,12 @@ +#include +#include + +#define GLOBAL(name) \ + .globl VMLINUX_SYMBOL(name); \ + VMLINUX_SYMBOL(name): + + __INITRODATA + +GLOBAL(system_certificate_list) + .incbin "kernel/x509_certificate_list" +GLOBAL(system_certificate_list_end) diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c new file mode 100644 index 000000000000..51c35141a13a --- /dev/null +++ b/kernel/system_keyring.c @@ -0,0 +1,103 @@ +/* System trusted keyring for trusted public keys + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "module-internal.h" + +struct key *system_trusted_keyring; +EXPORT_SYMBOL_GPL(system_trusted_keyring); + +extern __initconst const u8 system_certificate_list[]; +extern __initconst const u8 system_certificate_list_end[]; + +/* + * Load the compiled-in keys + */ +static __init int system_trusted_keyring_init(void) +{ + pr_notice("Initialise system trusted keyring\n"); + + system_trusted_keyring = + keyring_alloc(".system_keyring", + KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(system_trusted_keyring)) + panic("Can't allocate system trusted keyring\n"); + + return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(system_trusted_keyring_init); + +/* + * Load the compiled-in list of X.509 certificates. + */ +static __init int load_system_certificate_list(void) +{ + key_ref_t key; + const u8 *p, *end; + size_t plen; + + pr_notice("Loading compiled-in X.509 certificates\n"); + + end = system_certificate_list_end; + p = system_certificate_list; + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. + */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 && + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), + "asymmetric", + NULL, + p, + plen, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(key)) { + pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + } else { + pr_notice("Loaded X.509 cert '%s'\n", + key_ref_to_ptr(key)->description); + key_ref_put(key); + } + p += plen; + } + + return 0; + +dodgy_cert: + pr_err("Problem parsing in-kernel X.509 certificate list\n"); + return 0; +} +late_initcall(load_system_certificate_list); -- cgit v1.3 From 4102adab9189c8ea2f0cdd2f88345fd25d2790f1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Oct 2013 20:23:47 -0700 Subject: rcu: Move RCU-related source code to kernel/rcu directory Signed-off-by: Paul E. McKenney Reviewed-by: Ingo Molnar --- Documentation/DocBook/device-drivers.tmpl | 5 +- Documentation/kernel-parameters.txt | 95 +- MAINTAINERS | 11 +- kernel/Makefile | 11 +- kernel/rcu.h | 132 -- kernel/rcu/Makefile | 6 + kernel/rcu/rcu.h | 132 ++ kernel/rcu/srcu.c | 651 ++++++ kernel/rcu/tiny.c | 388 ++++ kernel/rcu/tiny_plugin.h | 174 ++ kernel/rcu/torture.c | 2145 ++++++++++++++++++ kernel/rcu/tree.c | 3403 +++++++++++++++++++++++++++++ kernel/rcu/tree.h | 585 +++++ kernel/rcu/tree_plugin.h | 2831 ++++++++++++++++++++++++ kernel/rcu/tree_trace.c | 500 +++++ kernel/rcu/update.c | 347 +++ kernel/rcupdate.c | 341 --- kernel/rcutiny.c | 388 ---- kernel/rcutiny_plugin.h | 174 -- kernel/rcutorture.c | 2139 ------------------ kernel/rcutree.c | 3396 ---------------------------- kernel/rcutree.h | 585 ----- kernel/rcutree_plugin.h | 2831 ------------------------ kernel/rcutree_trace.c | 500 ----- kernel/srcu.c | 651 ------ 25 files changed, 11232 insertions(+), 11189 deletions(-) delete mode 100644 kernel/rcu.h create mode 100644 kernel/rcu/Makefile create mode 100644 kernel/rcu/rcu.h create mode 100644 kernel/rcu/srcu.c create mode 100644 kernel/rcu/tiny.c create mode 100644 kernel/rcu/tiny_plugin.h create mode 100644 kernel/rcu/torture.c create mode 100644 kernel/rcu/tree.c create mode 100644 kernel/rcu/tree.h create mode 100644 kernel/rcu/tree_plugin.h create mode 100644 kernel/rcu/tree_trace.c create mode 100644 kernel/rcu/update.c delete mode 100644 kernel/rcupdate.c delete mode 100644 kernel/rcutiny.c delete mode 100644 kernel/rcutiny_plugin.h delete mode 100644 kernel/rcutorture.c delete mode 100644 kernel/rcutree.c delete mode 100644 kernel/rcutree.h delete mode 100644 kernel/rcutree_plugin.h delete mode 100644 kernel/rcutree_trace.c delete mode 100644 kernel/srcu.c (limited to 'kernel/Makefile') diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index fe397f90a34f..6c9d9d37c83a 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl @@ -87,7 +87,10 @@ X!Iinclude/linux/kobject.h !Ekernel/printk/printk.c !Ekernel/panic.c !Ekernel/sys.c -!Ekernel/rcupdate.c +!Ekernel/rcu/srcu.c +!Ekernel/rcu/tree.c +!Ekernel/rcu/tree_plugin.h +!Ekernel/rcu/update.c Device Resource Management diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1a036cd972fb..c3dc13e90a40 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2595,7 +2595,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. ramdisk_size= [RAM] Sizes of RAM disks in kilobytes See Documentation/blockdev/ramdisk.txt. - rcu_nocbs= [KNL,BOOT] + rcu_nocbs= [KNL] In kernels built with CONFIG_RCU_NOCB_CPU=y, set the specified list of CPUs to be no-callback CPUs. Invocation of these CPUs' RCU callbacks will @@ -2608,7 +2608,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. real-time workloads. It can also improve energy efficiency for asymmetric multiprocessors. - rcu_nocb_poll [KNL,BOOT] + rcu_nocb_poll [KNL] Rather than requiring that offloaded CPUs (specified by rcu_nocbs= above) explicitly awaken the corresponding "rcuoN" kthreads, @@ -2619,126 +2619,145 @@ bytes respectively. Such letter suffixes can also be entirely omitted. energy efficiency by requiring that the kthreads periodically wake up to do the polling. - rcutree.blimit= [KNL,BOOT] + rcutree.blimit= [KNL] Set maximum number of finished RCU callbacks to process in one batch. - rcutree.fanout_leaf= [KNL,BOOT] + rcutree.rcu_fanout_leaf= [KNL] Increase the number of CPUs assigned to each leaf rcu_node structure. Useful for very large systems. - rcutree.jiffies_till_first_fqs= [KNL,BOOT] + rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. Units are jiffies, minimum value is zero, and maximum value is HZ. - rcutree.jiffies_till_next_fqs= [KNL,BOOT] + rcutree.jiffies_till_next_fqs= [KNL] Set delay between subsequent attempts to force quiescent states. Units are jiffies, minimum value is one, and maximum value is HZ. - rcutree.qhimark= [KNL,BOOT] + rcutree.qhimark= [KNL] Set threshold of queued RCU callbacks over which batch limiting is disabled. - rcutree.qlowmark= [KNL,BOOT] + rcutree.qlowmark= [KNL] Set threshold of queued RCU callbacks below which batch limiting is re-enabled. - rcutree.rcu_cpu_stall_suppress= [KNL,BOOT] - Suppress RCU CPU stall warning messages. - - rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] - Set timeout for RCU CPU stall warning messages. - - rcutree.rcu_idle_gp_delay= [KNL,BOOT] + rcutree.rcu_idle_gp_delay= [KNL] Set wakeup interval for idle CPUs that have RCU callbacks (RCU_FAST_NO_HZ=y). - rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT] + rcutree.rcu_idle_lazy_gp_delay= [KNL] Set wakeup interval for idle CPUs that have only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y). Lazy RCU callbacks are those which RCU can prove do nothing more than free memory. - rcutorture.fqs_duration= [KNL,BOOT] + rcutorture.fqs_duration= [KNL] Set duration of force_quiescent_state bursts. - rcutorture.fqs_holdoff= [KNL,BOOT] + rcutorture.fqs_holdoff= [KNL] Set holdoff time within force_quiescent_state bursts. - rcutorture.fqs_stutter= [KNL,BOOT] + rcutorture.fqs_stutter= [KNL] Set wait time between force_quiescent_state bursts. - rcutorture.irqreader= [KNL,BOOT] - Test RCU readers from irq handlers. + rcutorture.gp_exp= [KNL] + Use expedited update-side primitives. + + rcutorture.gp_normal= [KNL] + Use normal (non-expedited) update-side primitives. + If both gp_exp and gp_normal are set, do both. + If neither gp_exp nor gp_normal are set, still + do both. - rcutorture.n_barrier_cbs= [KNL,BOOT] + rcutorture.n_barrier_cbs= [KNL] Set callbacks/threads for rcu_barrier() testing. - rcutorture.nfakewriters= [KNL,BOOT] + rcutorture.nfakewriters= [KNL] Set number of concurrent RCU writers. These just stress RCU, they don't participate in the actual test, hence the "fake". - rcutorture.nreaders= [KNL,BOOT] + rcutorture.nreaders= [KNL] Set number of RCU readers. - rcutorture.onoff_holdoff= [KNL,BOOT] + rcutorture.object_debug= [KNL] + Enable debug-object double-call_rcu() testing. + + rcutorture.onoff_holdoff= [KNL] Set time (s) after boot for CPU-hotplug testing. - rcutorture.onoff_interval= [KNL,BOOT] + rcutorture.onoff_interval= [KNL] Set time (s) between CPU-hotplug operations, or zero to disable CPU-hotplug testing. - rcutorture.shuffle_interval= [KNL,BOOT] + rcutorture.rcutorture_runnable= [BOOT] + Start rcutorture running at boot time. + + rcutorture.shuffle_interval= [KNL] Set task-shuffle interval (s). Shuffling tasks allows some CPUs to go into dyntick-idle mode during the rcutorture test. - rcutorture.shutdown_secs= [KNL,BOOT] + rcutorture.shutdown_secs= [KNL] Set time (s) after boot system shutdown. This is useful for hands-off automated testing. - rcutorture.stall_cpu= [KNL,BOOT] + rcutorture.stall_cpu= [KNL] Duration of CPU stall (s) to test RCU CPU stall warnings, zero to disable. - rcutorture.stall_cpu_holdoff= [KNL,BOOT] + rcutorture.stall_cpu_holdoff= [KNL] Time to wait (s) after boot before inducing stall. - rcutorture.stat_interval= [KNL,BOOT] + rcutorture.stat_interval= [KNL] Time (s) between statistics printk()s. - rcutorture.stutter= [KNL,BOOT] + rcutorture.stutter= [KNL] Time (s) to stutter testing, for example, specifying five seconds causes the test to run for five seconds, wait for five seconds, and so on. This tests RCU's ability to transition abruptly to and from idle. - rcutorture.test_boost= [KNL,BOOT] + rcutorture.test_boost= [KNL] Test RCU priority boosting? 0=no, 1=maybe, 2=yes. "Maybe" means test if the RCU implementation under test support RCU priority boosting. - rcutorture.test_boost_duration= [KNL,BOOT] + rcutorture.test_boost_duration= [KNL] Duration (s) of each individual boost test. - rcutorture.test_boost_interval= [KNL,BOOT] + rcutorture.test_boost_interval= [KNL] Interval (s) between each boost test. - rcutorture.test_no_idle_hz= [KNL,BOOT] + rcutorture.test_no_idle_hz= [KNL] Test RCU's dyntick-idle handling. See also the rcutorture.shuffle_interval parameter. - rcutorture.torture_type= [KNL,BOOT] + rcutorture.torture_type= [KNL] Specify the RCU implementation to test. - rcutorture.verbose= [KNL,BOOT] + rcutorture.verbose= [KNL] Enable additional printk() statements. + rcupdate.rcu_expedited= [KNL] + Use expedited grace-period primitives, for + example, synchronize_rcu_expedited() instead + of synchronize_rcu(). This reduces latency, + but can increase CPU utilization, degrade + real-time latency, and degrade energy efficiency. + + rcupdate.rcu_cpu_stall_suppress= [KNL] + Suppress RCU CPU stall warning messages. + + rcupdate.rcu_cpu_stall_timeout= [KNL] + Set timeout for RCU CPU stall warning messages. + rdinit= [KNL] Format: Run specified binary instead of /init from the ramdisk, diff --git a/MAINTAINERS b/MAINTAINERS index e61c2e83fc2b..28f2478b6794 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6903,7 +6903,7 @@ M: "Paul E. McKenney" S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git F: Documentation/RCU/torture.txt -F: kernel/rcutorture.c +F: kernel/rcu/torture.c RDC R-321X SoC M: Florian Fainelli @@ -6930,8 +6930,9 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git F: Documentation/RCU/ X: Documentation/RCU/torture.txt F: include/linux/rcu* -F: kernel/rcu* -X: kernel/rcutorture.c +X: include/linux/srcu.h +F: kernel/rcu/ +X: kernel/rcu/torture.c REAL TIME CLOCK (RTC) SUBSYSTEM M: Alessandro Zummo @@ -7618,8 +7619,8 @@ M: "Paul E. McKenney" W: http://www.rdrop.com/users/paulmck/RCU/ S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git -F: include/linux/srcu* -F: kernel/srcu* +F: include/linux/srcu.h +F: kernel/rcu/srcu.c SMACK SECURITY MODULE M: Casey Schaufler diff --git a/kernel/Makefile b/kernel/Makefile index 1ce47553fb02..f99d908b5550 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -6,9 +6,9 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ - rcupdate.o extable.o params.o posix-timers.o \ + extable.o params.o posix-timers.o \ kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ + hrtimer.o rwsem.o nsproxy.o semaphore.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o @@ -27,6 +27,7 @@ obj-y += power/ obj-y += printk/ obj-y += cpu/ obj-y += irq/ +obj-y += rcu/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -81,12 +82,6 @@ obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_SECCOMP) += seccomp.o -obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_TREE_RCU) += rcutree.o -obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o -obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o -obj-$(CONFIG_TINY_RCU) += rcutiny.o -obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o diff --git a/kernel/rcu.h b/kernel/rcu.h deleted file mode 100644 index 7859a0a3951e..000000000000 --- a/kernel/rcu.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Read-Copy Update definitions shared among RCU implementations. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2011 - * - * Author: Paul E. McKenney - */ - -#ifndef __LINUX_RCU_H -#define __LINUX_RCU_H - -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(stmt) stmt -#else /* #ifdef CONFIG_RCU_TRACE */ -#define RCU_TRACE(stmt) -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - -/* - * Process-level increment to ->dynticks_nesting field. This allows for - * architectures that use half-interrupts and half-exceptions from - * process context. - * - * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH - * that counts the number of process-based reasons why RCU cannot - * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE - * is the value used to increment or decrement this field. - * - * The rest of the bits could in principle be used to count interrupts, - * but this would mean that a negative-one value in the interrupt - * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field. - * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK - * that is set to DYNTICK_TASK_FLAG upon initial exit from idle. - * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon - * initial exit from idle. - */ -#define DYNTICK_TASK_NEST_WIDTH 7 -#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1) -#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1) -#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2) -#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3) -#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ - DYNTICK_TASK_FLAG) - -/* - * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally - * by call_rcu() and rcu callback execution, and are therefore not part of the - * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. - */ - -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD -# define STATE_RCU_HEAD_READY 0 -# define STATE_RCU_HEAD_QUEUED 1 - -extern struct debug_obj_descr rcuhead_debug_descr; - -static inline int debug_rcu_head_queue(struct rcu_head *head) -{ - int r1; - - r1 = debug_object_activate(head, &rcuhead_debug_descr); - debug_object_active_state(head, &rcuhead_debug_descr, - STATE_RCU_HEAD_READY, - STATE_RCU_HEAD_QUEUED); - return r1; -} - -static inline void debug_rcu_head_unqueue(struct rcu_head *head) -{ - debug_object_active_state(head, &rcuhead_debug_descr, - STATE_RCU_HEAD_QUEUED, - STATE_RCU_HEAD_READY); - debug_object_deactivate(head, &rcuhead_debug_descr); -} -#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -static inline int debug_rcu_head_queue(struct rcu_head *head) -{ - return 0; -} - -static inline void debug_rcu_head_unqueue(struct rcu_head *head) -{ -} -#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - -extern void kfree(const void *); - -static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) -{ - unsigned long offset = (unsigned long)head->func; - - if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); - kfree((void *)head - offset); - return 1; - } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head)); - head->func(head); - return 0; - } -} - -extern int rcu_expedited; - -#ifdef CONFIG_RCU_STALL_COMMON - -extern int rcu_cpu_stall_suppress; -int rcu_jiffies_till_stall_check(void); - -#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ - -/* - * Strings used in tracepoints need to be exported via the - * tracing system such that tools like perf and trace-cmd can - * translate the string address pointers to actual text. - */ -#define TPS(x) tracepoint_string(x) - -#endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile new file mode 100644 index 000000000000..01e9ec37a3e3 --- /dev/null +++ b/kernel/rcu/Makefile @@ -0,0 +1,6 @@ +obj-y += update.o srcu.o +obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o +obj-$(CONFIG_TREE_RCU) += tree.o +obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o +obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o +obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h new file mode 100644 index 000000000000..7859a0a3951e --- /dev/null +++ b/kernel/rcu/rcu.h @@ -0,0 +1,132 @@ +/* + * Read-Copy Update definitions shared among RCU implementations. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2011 + * + * Author: Paul E. McKenney + */ + +#ifndef __LINUX_RCU_H +#define __LINUX_RCU_H + +#ifdef CONFIG_RCU_TRACE +#define RCU_TRACE(stmt) stmt +#else /* #ifdef CONFIG_RCU_TRACE */ +#define RCU_TRACE(stmt) +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +/* + * Process-level increment to ->dynticks_nesting field. This allows for + * architectures that use half-interrupts and half-exceptions from + * process context. + * + * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH + * that counts the number of process-based reasons why RCU cannot + * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE + * is the value used to increment or decrement this field. + * + * The rest of the bits could in principle be used to count interrupts, + * but this would mean that a negative-one value in the interrupt + * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field. + * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK + * that is set to DYNTICK_TASK_FLAG upon initial exit from idle. + * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon + * initial exit from idle. + */ +#define DYNTICK_TASK_NEST_WIDTH 7 +#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1) +#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1) +#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2) +#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3) +#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ + DYNTICK_TASK_FLAG) + +/* + * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally + * by call_rcu() and rcu callback execution, and are therefore not part of the + * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. + */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +# define STATE_RCU_HEAD_READY 0 +# define STATE_RCU_HEAD_QUEUED 1 + +extern struct debug_obj_descr rcuhead_debug_descr; + +static inline int debug_rcu_head_queue(struct rcu_head *head) +{ + int r1; + + r1 = debug_object_activate(head, &rcuhead_debug_descr); + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_READY, + STATE_RCU_HEAD_QUEUED); + return r1; +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_QUEUED, + STATE_RCU_HEAD_READY); + debug_object_deactivate(head, &rcuhead_debug_descr); +} +#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline int debug_rcu_head_queue(struct rcu_head *head) +{ + return 0; +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ +} +#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +extern void kfree(const void *); + +static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) +{ + unsigned long offset = (unsigned long)head->func; + + if (__is_kfree_rcu_offset(offset)) { + RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); + kfree((void *)head - offset); + return 1; + } else { + RCU_TRACE(trace_rcu_invoke_callback(rn, head)); + head->func(head); + return 0; + } +} + +extern int rcu_expedited; + +#ifdef CONFIG_RCU_STALL_COMMON + +extern int rcu_cpu_stall_suppress; +int rcu_jiffies_till_stall_check(void); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ + +/* + * Strings used in tracepoints need to be exported via the + * tracing system such that tools like perf and trace-cmd can + * translate the string address pointers to actual text. + */ +#define TPS(x) tracepoint_string(x) + +#endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c new file mode 100644 index 000000000000..01d5ccb8bfe3 --- /dev/null +++ b/kernel/rcu/srcu.c @@ -0,0 +1,651 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * Copyright (C) Fujitsu, 2012 + * + * Author: Paul McKenney + * Lai Jiangshan + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "rcu.h" + +/* + * Initialize an rcu_batch structure to empty. + */ +static inline void rcu_batch_init(struct rcu_batch *b) +{ + b->head = NULL; + b->tail = &b->head; +} + +/* + * Enqueue a callback onto the tail of the specified rcu_batch structure. + */ +static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) +{ + *b->tail = head; + b->tail = &head->next; +} + +/* + * Is the specified rcu_batch structure empty? + */ +static inline bool rcu_batch_empty(struct rcu_batch *b) +{ + return b->tail == &b->head; +} + +/* + * Remove the callback at the head of the specified rcu_batch structure + * and return a pointer to it, or return NULL if the structure is empty. + */ +static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) +{ + struct rcu_head *head; + + if (rcu_batch_empty(b)) + return NULL; + + head = b->head; + b->head = head->next; + if (b->tail == &head->next) + rcu_batch_init(b); + + return head; +} + +/* + * Move all callbacks from the rcu_batch structure specified by "from" to + * the structure specified by "to". + */ +static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) +{ + if (!rcu_batch_empty(from)) { + *to->tail = from->head; + to->tail = from->tail; + rcu_batch_init(from); + } +} + +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->completed = 0; + spin_lock_init(&sp->queue_lock); + sp->running = false; + rcu_batch_init(&sp->batch_queue); + rcu_batch_init(&sp->batch_check0); + rcu_batch_init(&sp->batch_check1); + rcu_batch_init(&sp->batch_done); + INIT_DELAYED_WORK(&sp->work, process_srcu); + sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + return sp->per_cpu_ref ? 0 : -ENOMEM; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * Returns approximate total of the readers' ->seq[] values for the + * rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + unsigned long t; + + for_each_possible_cpu(cpu) { + t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); + sum += t; + } + return sum; +} + +/* + * Returns approximate number of readers active on the specified rank + * of the per-CPU ->c[] counters. + */ +static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + unsigned long t; + + for_each_possible_cpu(cpu) { + t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); + sum += t; + } + return sum; +} + +/* + * Return true if the number of pre-existing readers is determined to + * be stably zero. An example unstable zero can occur if the call + * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, + * but due to task migration, sees the corresponding __srcu_read_unlock() + * decrement. This can happen because srcu_readers_active_idx() takes + * time to sum the array, and might in fact be interrupted or preempted + * partway through the summation. + */ +static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +{ + unsigned long seq; + + seq = srcu_readers_seq_idx(sp, idx); + + /* + * The following smp_mb() A pairs with the smp_mb() B located in + * __srcu_read_lock(). This pairing ensures that if an + * __srcu_read_lock() increments its counter after the summation + * in srcu_readers_active_idx(), then the corresponding SRCU read-side + * critical section will see any changes made prior to the start + * of the current SRCU grace period. + * + * Also, if the above call to srcu_readers_seq_idx() saw the + * increment of ->seq[], then the call to srcu_readers_active_idx() + * must see the increment of ->c[]. + */ + smp_mb(); /* A */ + + /* + * Note that srcu_readers_active_idx() can incorrectly return + * zero even though there is a pre-existing reader throughout. + * To see this, suppose that task A is in a very long SRCU + * read-side critical section that started on CPU 0, and that + * no other reader exists, so that the sum of the counters + * is equal to one. Then suppose that task B starts executing + * srcu_readers_active_idx(), summing up to CPU 1, and then that + * task C starts reading on CPU 0, so that its increment is not + * summed, but finishes reading on CPU 2, so that its decrement + * -is- summed. Then when task B completes its sum, it will + * incorrectly get zero, despite the fact that task A has been + * in its SRCU read-side critical section the whole time. + * + * We therefore do a validation step should srcu_readers_active_idx() + * return zero. + */ + if (srcu_readers_active_idx(sp, idx) != 0) + return false; + + /* + * The remainder of this function is the validation step. + * The following smp_mb() D pairs with the smp_mb() C in + * __srcu_read_unlock(). If the __srcu_read_unlock() was seen + * by srcu_readers_active_idx() above, then any destructive + * operation performed after the grace period will happen after + * the corresponding SRCU read-side critical section. + * + * Note that there can be at most NR_CPUS worth of readers using + * the old index, which is not enough to overflow even a 32-bit + * integer. (Yes, this does mean that systems having more than + * a billion or so CPUs need to be 64-bit systems.) Therefore, + * the sum of the ->seq[] counters cannot possibly overflow. + * Therefore, the only way that the return values of the two + * calls to srcu_readers_seq_idx() can be equal is if there were + * no increments of the corresponding rank of ->seq[] counts + * in the interim. But the missed-increment scenario laid out + * above includes an increment of the ->seq[] counter by + * the corresponding __srcu_read_lock(). Therefore, if this + * scenario occurs, the return values from the two calls to + * srcu_readers_seq_idx() will differ, and thus the validation + * step below suffices. + */ + smp_mb(); /* D */ + + return srcu_readers_seq_idx(sp, idx) == seq; +} + +/** + * srcu_readers_active - returns approximate number of readers. + * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * + * Note that this is not an atomic primitive, and can therefore suffer + * severe errors when invoked on an active srcu_struct. That said, it + * can be useful as an error check at cleanup time. + */ +static int srcu_readers_active(struct srcu_struct *sp) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); + sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); + } + return sum; +} + +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + if (WARN_ON(srcu_readers_active(sp))) + return; /* Leakage unless caller handles error. */ + free_percpu(sp->per_cpu_ref); + sp->per_cpu_ref = NULL; +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = ACCESS_ONCE(sp->completed) & 0x1; + preempt_disable(); + ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; + smp_mb(); /* B */ /* Avoid leaking the critical section. */ + ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; + preempt_enable(); + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + * Must be called from process context. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + smp_mb(); /* C */ /* Avoid leaking the critical section. */ + this_cpu_dec(sp->per_cpu_ref->c[idx]); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after 10 microseconds, + * we repeatedly block for 1-millisecond time periods. This approach + * has done well in testing, so there is no need for a config parameter. + */ +#define SRCU_RETRY_CHECK_DELAY 5 +#define SYNCHRONIZE_SRCU_TRYCOUNT 2 +#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 + +/* + * @@@ Wait until all pre-existing readers complete. Such readers + * will have used the index specified by "idx". + * the caller should ensures the ->completed is not changed while checking + * and idx = (->completed & 1) ^ 1 + */ +static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) +{ + for (;;) { + if (srcu_readers_active_idx_check(sp, idx)) + return true; + if (--trycount <= 0) + return false; + udelay(SRCU_RETRY_CHECK_DELAY); + } +} + +/* + * Increment the ->completed counter so that future SRCU readers will + * use the other rank of the ->c[] and ->seq[] arrays. This allows + * us to wait for pre-existing readers in a starvation-free manner. + */ +static void srcu_flip(struct srcu_struct *sp) +{ + sp->completed++; +} + +/* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. + */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + unsigned long flags; + + head->next = NULL; + head->func = func; + spin_lock_irqsave(&sp->queue_lock, flags); + rcu_batch_queue(&sp->batch_queue, head); + if (!sp->running) { + sp->running = true; + schedule_delayed_work(&sp->work, 0); + } + spin_unlock_irqrestore(&sp->queue_lock, flags); +} +EXPORT_SYMBOL_GPL(call_srcu); + +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; +}; + +/* + * Awaken the corresponding synchronize_srcu() instance now that a + * grace period has elapsed. + */ +static void wakeme_after_rcu(struct rcu_head *head) +{ + struct rcu_synchronize *rcu; + + rcu = container_of(head, struct rcu_synchronize, head); + complete(&rcu->completion); +} + +static void srcu_advance_batches(struct srcu_struct *sp, int trycount); +static void srcu_reschedule(struct srcu_struct *sp); + +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). + */ +static void __synchronize_srcu(struct srcu_struct *sp, int trycount) +{ + struct rcu_synchronize rcu; + struct rcu_head *head = &rcu.head; + bool done = false; + + rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && + !lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); + + might_sleep(); + init_completion(&rcu.completion); + + head->next = NULL; + head->func = wakeme_after_rcu; + spin_lock_irq(&sp->queue_lock); + if (!sp->running) { + /* steal the processing owner */ + sp->running = true; + rcu_batch_queue(&sp->batch_check0, head); + spin_unlock_irq(&sp->queue_lock); + + srcu_advance_batches(sp, trycount); + if (!rcu_batch_empty(&sp->batch_done)) { + BUG_ON(sp->batch_done.head != head); + rcu_batch_dequeue(&sp->batch_done); + done = true; + } + /* give the processing owner to work_struct */ + srcu_reschedule(sp); + } else { + rcu_batch_queue(&sp->batch_queue, head); + spin_unlock_irq(&sp->queue_lock); + } + + if (!done) + wait_for_completion(&rcu.completion); +} + +/** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Wait for the count to drain to zero of both indexes. To avoid the + * possible starvation of synchronize_srcu(), it waits for the count of + * the index=((->completed & 1) ^ 1) to drain to zero at first, + * and then flip the completed and wait for the count of the other index. + * + * Can block; must be called from process context. + * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, rcu_expedited + ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT + : SYNCHRONIZE_SRCU_TRYCOUNT); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); + +/** + * synchronize_srcu_expedited - Brute-force SRCU grace period + * @sp: srcu_struct with which to synchronize. + * + * Wait for an SRCU grace period to elapse, but be more aggressive about + * spinning rather than blocking when waiting. + * + * Note that it is also illegal to call synchronize_srcu_expedited() + * from the corresponding SRCU read-side critical section; + * doing so will result in deadlock. However, it is perfectly legal + * to call synchronize_srcu_expedited() on one srcu_struct from some + * other srcu_struct's read-side critical section, as long as + * the resulting graph of srcu_structs is acyclic. + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + +/** + * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + */ +void srcu_barrier(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} +EXPORT_SYMBOL_GPL(srcu_barrier); + +/** + * srcu_batches_completed - return batches completed. + * @sp: srcu_struct on which to report batch completion. + * + * Report the number of batches, correlated with, but not necessarily + * precisely the same as, the number of grace periods that have elapsed. + */ +long srcu_batches_completed(struct srcu_struct *sp) +{ + return sp->completed; +} +EXPORT_SYMBOL_GPL(srcu_batches_completed); + +#define SRCU_CALLBACK_BATCH 10 +#define SRCU_INTERVAL 1 + +/* + * Move any new SRCU callbacks to the first stage of the SRCU grace + * period pipeline. + */ +static void srcu_collect_new(struct srcu_struct *sp) +{ + if (!rcu_batch_empty(&sp->batch_queue)) { + spin_lock_irq(&sp->queue_lock); + rcu_batch_move(&sp->batch_check0, &sp->batch_queue); + spin_unlock_irq(&sp->queue_lock); + } +} + +/* + * Core SRCU state machine. Advance callbacks from ->batch_check0 to + * ->batch_check1 and then to ->batch_done as readers drain. + */ +static void srcu_advance_batches(struct srcu_struct *sp, int trycount) +{ + int idx = 1 ^ (sp->completed & 1); + + /* + * Because readers might be delayed for an extended period after + * fetching ->completed for their index, at any point in time there + * might well be readers using both idx=0 and idx=1. We therefore + * need to wait for readers to clear from both index values before + * invoking a callback. + */ + + if (rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_check1)) + return; /* no callbacks need to be advanced */ + + if (!try_check_zero(sp, idx, trycount)) + return; /* failed to advance, will try after SRCU_INTERVAL */ + + /* + * The callbacks in ->batch_check1 have already done with their + * first zero check and flip back when they were enqueued on + * ->batch_check0 in a previous invocation of srcu_advance_batches(). + * (Presumably try_check_zero() returned false during that + * invocation, leaving the callbacks stranded on ->batch_check1.) + * They are therefore ready to invoke, so move them to ->batch_done. + */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); + + if (rcu_batch_empty(&sp->batch_check0)) + return; /* no callbacks need to be advanced */ + srcu_flip(sp); + + /* + * The callbacks in ->batch_check0 just finished their + * first check zero and flip, so move them to ->batch_check1 + * for future checking on the other idx. + */ + rcu_batch_move(&sp->batch_check1, &sp->batch_check0); + + /* + * SRCU read-side critical sections are normally short, so check + * at least twice in quick succession after a flip. + */ + trycount = trycount < 2 ? 2 : trycount; + if (!try_check_zero(sp, idx^1, trycount)) + return; /* failed to advance, will try after SRCU_INTERVAL */ + + /* + * The callbacks in ->batch_check1 have now waited for all + * pre-existing readers using both idx values. They are therefore + * ready to invoke, so move them to ->batch_done. + */ + rcu_batch_move(&sp->batch_done, &sp->batch_check1); +} + +/* + * Invoke a limited number of SRCU callbacks that have passed through + * their grace period. If there are more to do, SRCU will reschedule + * the workqueue. + */ +static void srcu_invoke_callbacks(struct srcu_struct *sp) +{ + int i; + struct rcu_head *head; + + for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { + head = rcu_batch_dequeue(&sp->batch_done); + if (!head) + break; + local_bh_disable(); + head->func(head); + local_bh_enable(); + } +} + +/* + * Finished one round of SRCU grace period. Start another if there are + * more SRCU callbacks queued, otherwise put SRCU into not-running state. + */ +static void srcu_reschedule(struct srcu_struct *sp) +{ + bool pending = true; + + if (rcu_batch_empty(&sp->batch_done) && + rcu_batch_empty(&sp->batch_check1) && + rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_queue)) { + spin_lock_irq(&sp->queue_lock); + if (rcu_batch_empty(&sp->batch_done) && + rcu_batch_empty(&sp->batch_check1) && + rcu_batch_empty(&sp->batch_check0) && + rcu_batch_empty(&sp->batch_queue)) { + sp->running = false; + pending = false; + } + spin_unlock_irq(&sp->queue_lock); + } + + if (pending) + schedule_delayed_work(&sp->work, SRCU_INTERVAL); +} + +/* + * This is the work-queue function that handles SRCU grace periods. + */ +void process_srcu(struct work_struct *work) +{ + struct srcu_struct *sp; + + sp = container_of(work, struct srcu_struct, work.work); + + srcu_collect_new(sp); + srcu_advance_batches(sp, 1); + srcu_invoke_callbacks(sp); + srcu_reschedule(sp); +} +EXPORT_SYMBOL_GPL(process_srcu); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c new file mode 100644 index 000000000000..0c9a934cfec1 --- /dev/null +++ b/kernel/rcu/tiny.c @@ -0,0 +1,388 @@ +/* + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Paul E. McKenney + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_RCU_TRACE +#include +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +#include "rcu.h" + +/* Forward declarations for tiny_plugin.h. */ +struct rcu_ctrlblk; +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static void rcu_process_callbacks(struct softirq_action *unused); +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp); + +static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + +#include "tiny_plugin.h" + +/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ +static void rcu_idle_enter_common(long long newval) +{ + if (newval) { + RCU_TRACE(trace_rcu_dyntick(TPS("--="), + rcu_dynticks_nesting, newval)); + rcu_dynticks_nesting = newval; + return; + } + RCU_TRACE(trace_rcu_dyntick(TPS("Start"), + rcu_dynticks_nesting, newval)); + if (!is_idle_task(current)) { + struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); + + RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), + rcu_dynticks_nesting, newval)); + ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ + } + rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ + barrier(); + rcu_dynticks_nesting = newval; +} + +/* + * Enter idle, which is an extended quiescent state if we have fully + * entered that mode (i.e., if the new value of dynticks_nesting is zero). + */ +void rcu_idle_enter(void) +{ + unsigned long flags; + long long newval; + + local_irq_save(flags); + WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); + if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == + DYNTICK_TASK_NEST_VALUE) + newval = 0; + else + newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; + rcu_idle_enter_common(newval); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(rcu_idle_enter); + +/* + * Exit an interrupt handler towards idle. + */ +void rcu_irq_exit(void) +{ + unsigned long flags; + long long newval; + + local_irq_save(flags); + newval = rcu_dynticks_nesting - 1; + WARN_ON_ONCE(newval < 0); + rcu_idle_enter_common(newval); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(rcu_irq_exit); + +/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ +static void rcu_idle_exit_common(long long oldval) +{ + if (oldval) { + RCU_TRACE(trace_rcu_dyntick(TPS("++="), + oldval, rcu_dynticks_nesting)); + return; + } + RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); + if (!is_idle_task(current)) { + struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); + + RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), + oldval, rcu_dynticks_nesting)); + ftrace_dump(DUMP_ALL); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ + } +} + +/* + * Exit idle, so that we are no longer in an extended quiescent state. + */ +void rcu_idle_exit(void) +{ + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; + WARN_ON_ONCE(rcu_dynticks_nesting < 0); + if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) + rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; + else + rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rcu_idle_exit_common(oldval); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(rcu_idle_exit); + +/* + * Enter an interrupt handler, moving away from idle. + */ +void rcu_irq_enter(void) +{ + unsigned long flags; + long long oldval; + + local_irq_save(flags); + oldval = rcu_dynticks_nesting; + rcu_dynticks_nesting++; + WARN_ON_ONCE(rcu_dynticks_nesting == 0); + rcu_idle_exit_common(oldval); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(rcu_irq_enter); + +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) + +/* + * Test whether RCU thinks that the current CPU is idle. + */ +bool __rcu_is_watching(void) +{ + return rcu_dynticks_nesting; +} +EXPORT_SYMBOL(__rcu_is_watching); + +#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ + +/* + * Test whether the current CPU was interrupted from idle. Nested + * interrupts don't count, we must be running at the first interrupt + * level. + */ +static int rcu_is_cpu_rrupt_from_idle(void) +{ + return rcu_dynticks_nesting <= 1; +} + +/* + * Helper function for rcu_sched_qs() and rcu_bh_qs(). + * Also irqs are disabled to avoid confusion due to interrupt handlers + * invoking call_rcu(). + */ +static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) +{ + RCU_TRACE(reset_cpu_stall_ticks(rcp)); + if (rcp->rcucblist != NULL && + rcp->donetail != rcp->curtail) { + rcp->donetail = rcp->curtail; + return 1; + } + + return 0; +} + +/* + * Record an rcu quiescent state. And an rcu_bh quiescent state while we + * are at it, given that any rcu quiescent state is also an rcu_bh + * quiescent state. Use "+" instead of "||" to defeat short circuiting. + */ +void rcu_sched_qs(int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + if (rcu_qsctr_help(&rcu_sched_ctrlblk) + + rcu_qsctr_help(&rcu_bh_ctrlblk)) + raise_softirq(RCU_SOFTIRQ); + local_irq_restore(flags); +} + +/* + * Record an rcu_bh quiescent state. + */ +void rcu_bh_qs(int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + if (rcu_qsctr_help(&rcu_bh_ctrlblk)) + raise_softirq(RCU_SOFTIRQ); + local_irq_restore(flags); +} + +/* + * Check to see if the scheduling-clock interrupt came from an extended + * quiescent state, and, if so, tell RCU about it. This function must + * be called from hardirq context. It is normally called from the + * scheduling-clock interrupt. + */ +void rcu_check_callbacks(int cpu, int user) +{ + RCU_TRACE(check_cpu_stalls()); + if (user || rcu_is_cpu_rrupt_from_idle()) + rcu_sched_qs(cpu); + else if (!in_softirq()) + rcu_bh_qs(cpu); +} + +/* + * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure + * whose grace period has elapsed. + */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) +{ + const char *rn = NULL; + struct rcu_head *next, *list; + unsigned long flags; + RCU_TRACE(int cb_count = 0); + + /* If no RCU callbacks ready to invoke, just return. */ + if (&rcp->rcucblist == rcp->donetail) { + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, + !!ACCESS_ONCE(rcp->rcucblist), + need_resched(), + is_idle_task(current), + false)); + return; + } + + /* Move the ready-to-invoke callbacks to a local list. */ + local_irq_save(flags); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); + list = rcp->rcucblist; + rcp->rcucblist = *rcp->donetail; + *rcp->donetail = NULL; + if (rcp->curtail == rcp->donetail) + rcp->curtail = &rcp->rcucblist; + rcp->donetail = &rcp->rcucblist; + local_irq_restore(flags); + + /* Invoke the callbacks on the local list. */ + RCU_TRACE(rn = rcp->name); + while (list) { + next = list->next; + prefetch(next); + debug_rcu_head_unqueue(list); + local_bh_disable(); + __rcu_reclaim(rn, list); + local_bh_enable(); + list = next; + RCU_TRACE(cb_count++); + } + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, + cb_count, 0, need_resched(), + is_idle_task(current), + false)); +} + +static void rcu_process_callbacks(struct softirq_action *unused) +{ + __rcu_process_callbacks(&rcu_sched_ctrlblk); + __rcu_process_callbacks(&rcu_bh_ctrlblk); +} + +/* + * Wait for a grace period to elapse. But it is illegal to invoke + * synchronize_sched() from within an RCU read-side critical section. + * Therefore, any legal call to synchronize_sched() is a quiescent + * state, and so on a UP system, synchronize_sched() need do nothing. + * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the + * benefits of doing might_sleep() to reduce latency.) + * + * Cool, huh? (Due to Josh Triplett.) + * + * But we want to make this a static inline later. The cond_resched() + * currently makes this problematic. + */ +void synchronize_sched(void) +{ + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU read-side critical section"); + cond_resched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +/* + * Helper function for call_rcu() and call_rcu_bh(). + */ +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcp->curtail = head; + rcp->curtail = &head->next; + RCU_TRACE(rcp->qlen++); + local_irq_restore(flags); +} + +/* + * Post an RCU callback to be invoked after the end of an RCU-sched grace + * period. But since we have but one CPU, that would be after any + * quiescent state. + */ +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_sched_ctrlblk); +} +EXPORT_SYMBOL_GPL(call_rcu_sched); + +/* + * Post an RCU bottom-half callback to be invoked after any subsequent + * quiescent state. + */ +void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_bh_ctrlblk); +} +EXPORT_SYMBOL_GPL(call_rcu_bh); + +void rcu_init(void) +{ + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h new file mode 100644 index 000000000000..280d06cae352 --- /dev/null +++ b/kernel/rcu/tiny_plugin.h @@ -0,0 +1,174 @@ +/* + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition + * Internal non-public definitions that provide either classic + * or preemptible semantics. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (c) 2010 Linaro + * + * Author: Paul E. McKenney + */ + +#include +#include +#include +#include + +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ + RCU_TRACE(long qlen); /* Number of pending CBs. */ + RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ + RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ + RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ + RCU_TRACE(const char *name); /* Name of RCU type. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, + RCU_TRACE(.name = "rcu_sched") +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, + RCU_TRACE(.name = "rcu_bh") +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#include + +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); + +/* + * During boot, we forgive RCU lockdep issues. After this function is + * invoked, we start taking RCU lockdep issues seriously. + */ +void __init rcu_scheduler_starting(void) +{ + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_RCU_TRACE + +static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) +{ + unsigned long flags; + + local_irq_save(flags); + rcp->qlen -= n; + local_irq_restore(flags); +} + +/* + * Dump statistics for TINY_RCU, such as they are. + */ +static int show_tiny_stats(struct seq_file *m, void *unused) +{ + seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); + seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); + return 0; +} + +static int show_tiny_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_tiny_stats, NULL); +} + +static const struct file_operations show_tiny_stats_fops = { + .owner = THIS_MODULE, + .open = show_tiny_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; + +static int __init rcutiny_trace_init(void) +{ + struct dentry *retval; + + rcudir = debugfs_create_dir("rcu", NULL); + if (!rcudir) + goto free_out; + retval = debugfs_create_file("rcudata", 0444, rcudir, + NULL, &show_tiny_stats_fops); + if (!retval) + goto free_out; + return 0; +free_out: + debugfs_remove_recursive(rcudir); + return 1; +} + +static void __exit rcutiny_trace_cleanup(void) +{ + debugfs_remove_recursive(rcudir); +} + +module_init(rcutiny_trace_init); +module_exit(rcutiny_trace_cleanup); + +MODULE_AUTHOR("Paul E. McKenney"); +MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); +MODULE_LICENSE("GPL"); + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long j; + unsigned long js; + + if (rcu_cpu_stall_suppress) + return; + rcp->ticks_this_gp++; + j = jiffies; + js = rcp->jiffies_stall; + if (*rcp->curtail && ULONG_CMP_GE(j, js)) { + pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", + rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, + jiffies - rcp->gp_start, rcp->qlen); + dump_stack(); + } + if (*rcp->curtail && ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + + 3 * rcu_jiffies_till_stall_check() + 3; + else if (ULONG_CMP_GE(j, js)) + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) +{ + rcp->ticks_this_gp = 0; + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +} + +static void check_cpu_stalls(void) +{ + RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); + RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c new file mode 100644 index 000000000000..3929cd451511 --- /dev/null +++ b/kernel/rcu/torture.c @@ -0,0 +1,2145 @@ +/* + * Read-Copy Update module-based torture test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005, 2006 + * + * Authors: Paul E. McKenney + * Josh Triplett + * + * See also: Documentation/RCU/torture.txt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); + +MODULE_ALIAS("rcutorture"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcutorture." + +static int fqs_duration; +module_param(fqs_duration, int, 0444); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); +static int fqs_holdoff; +module_param(fqs_holdoff, int, 0444); +MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +static int fqs_stutter = 3; +module_param(fqs_stutter, int, 0444); +MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +static bool gp_exp; +module_param(gp_exp, bool, 0444); +MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); +static bool gp_normal; +module_param(gp_normal, bool, 0444); +MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); +static int irqreader = 1; +module_param(irqreader, int, 0444); +MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +static int n_barrier_cbs; +module_param(n_barrier_cbs, int, 0444); +MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); +static int nfakewriters = 4; +module_param(nfakewriters, int, 0444); +MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); +static int nreaders = -1; +module_param(nreaders, int, 0444); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +static int object_debug; +module_param(object_debug, int, 0444); +MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); +static int onoff_holdoff; +module_param(onoff_holdoff, int, 0444); +MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); +static int onoff_interval; +module_param(onoff_interval, int, 0444); +MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +static int shuffle_interval = 3; +module_param(shuffle_interval, int, 0444); +MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); +static int shutdown_secs; +module_param(shutdown_secs, int, 0444); +MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); +static int stall_cpu; +module_param(stall_cpu, int, 0444); +MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); +static int stall_cpu_holdoff = 10; +module_param(stall_cpu_holdoff, int, 0444); +MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); +static int stat_interval = 60; +module_param(stat_interval, int, 0644); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +static int stutter = 5; +module_param(stutter, int, 0444); +MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); +static int test_boost = 1; +module_param(test_boost, int, 0444); +MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +static int test_boost_duration = 4; +module_param(test_boost_duration, int, 0444); +MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); +static int test_boost_interval = 7; +module_param(test_boost_interval, int, 0444); +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +static bool test_no_idle_hz = true; +module_param(test_no_idle_hz, bool, 0444); +MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +static char *torture_type = "rcu"; +module_param(torture_type, charp, 0444); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +static bool verbose; +module_param(verbose, bool, 0444); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); + +#define TORTURE_FLAG "-torture:" +#define PRINTK_STRING(s) \ + do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) +#define VERBOSE_PRINTK_STRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) +#define VERBOSE_PRINTK_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) + +static char printk_buf[4096]; + +static int nrealreaders; +static struct task_struct *writer_task; +static struct task_struct **fakewriter_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *stats_task; +static struct task_struct *shuffler_task; +static struct task_struct *stutter_task; +static struct task_struct *fqs_task; +static struct task_struct *boost_tasks[NR_CPUS]; +static struct task_struct *shutdown_task; +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct *onoff_task; +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static struct task_struct *stall_task; +static struct task_struct **barrier_cbs_tasks; +static struct task_struct *barrier_task; + +#define RCU_TORTURE_PIPE_LEN 10 + +struct rcu_torture { + struct rcu_head rtort_rcu; + int rtort_pipe_count; + struct list_head rtort_free; + int rtort_mbtest; +}; + +static LIST_HEAD(rcu_torture_freelist); +static struct rcu_torture __rcu *rcu_torture_current; +static unsigned long rcu_torture_current_version; +static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; +static DEFINE_SPINLOCK(rcu_torture_lock); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = + { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = + { 0 }; +static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +static atomic_t n_rcu_torture_alloc; +static atomic_t n_rcu_torture_alloc_fail; +static atomic_t n_rcu_torture_free; +static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_error; +static long n_rcu_torture_barrier_error; +static long n_rcu_torture_boost_ktrerror; +static long n_rcu_torture_boost_rterror; +static long n_rcu_torture_boost_failure; +static long n_rcu_torture_boosts; +static long n_rcu_torture_timers; +static long n_offline_attempts; +static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; +static long n_online_attempts; +static long n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; +static long n_barrier_attempts; +static long n_barrier_successes; +static struct list_head rcu_torture_removed; +static cpumask_var_t shuffle_tmp_mask; + +static int stutter_pause_test; + +#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) +#define RCUTORTURE_RUNNABLE_INIT 1 +#else +#define RCUTORTURE_RUNNABLE_INIT 0 +#endif +int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; +module_param(rcutorture_runnable, int, 0444); +MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); + +#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) +#define rcu_can_boost() 1 +#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#define rcu_can_boost() 0 +#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ + +#ifdef CONFIG_RCU_TRACE +static u64 notrace rcu_trace_clock_local(void) +{ + u64 ts = trace_clock_local(); + unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + return ts; +} +#else /* #ifdef CONFIG_RCU_TRACE */ +static u64 notrace rcu_trace_clock_local(void) +{ + return 0ULL; +} +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +static unsigned long shutdown_time; /* jiffies to system shutdown. */ +static unsigned long boost_starttime; /* jiffies of next boost test start. */ +DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ + /* and boost task create/destroy. */ +static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static bool barrier_phase; /* Test phase. */ +static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ +static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ +static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); + +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ + +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ +static int fullstop = FULLSTOP_RMMOD; +/* + * Protect fullstop transitions and spawning of kthreads. + */ +static DEFINE_MUTEX(fullstop_mutex); + +/* Forward reference. */ +static void rcu_torture_cleanup(void); + +/* + * Detect and respond to a system shutdown. + */ +static int +rcutorture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_DONTSTOP) + fullstop = FULLSTOP_SHUTDOWN; + else + pr_warn(/* but going down anyway, so... */ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + return NOTIFY_DONE; +} + +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +static void rcutorture_shutdown_absorb(const char *title) +{ + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + pr_notice( + "rcutorture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} + +/* + * Allocate an element from the rcu_tortures pool. + */ +static struct rcu_torture * +rcu_torture_alloc(void) +{ + struct list_head *p; + + spin_lock_bh(&rcu_torture_lock); + if (list_empty(&rcu_torture_freelist)) { + atomic_inc(&n_rcu_torture_alloc_fail); + spin_unlock_bh(&rcu_torture_lock); + return NULL; + } + atomic_inc(&n_rcu_torture_alloc); + p = rcu_torture_freelist.next; + list_del_init(p); + spin_unlock_bh(&rcu_torture_lock); + return container_of(p, struct rcu_torture, rtort_free); +} + +/* + * Free an element to the rcu_tortures pool. + */ +static void +rcu_torture_free(struct rcu_torture *p) +{ + atomic_inc(&n_rcu_torture_free); + spin_lock_bh(&rcu_torture_lock); + list_add_tail(&p->rtort_free, &rcu_torture_freelist); + spin_unlock_bh(&rcu_torture_lock); +} + +struct rcu_random_state { + unsigned long rrs_state; + long rrs_count; +}; + +#define RCU_RANDOM_MULT 39916801 /* prime */ +#define RCU_RANDOM_ADD 479001701 /* prime */ +#define RCU_RANDOM_REFRESH 10000 + +#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from cpu_clock(). + */ +static unsigned long +rcu_random(struct rcu_random_state *rrsp) +{ + if (--rrsp->rrs_count < 0) { + rrsp->rrs_state += (unsigned long)local_clock(); + rrsp->rrs_count = RCU_RANDOM_REFRESH; + } + rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; + return swahw32(rrsp->rrs_state); +} + +static void +rcu_stutter_wait(const char *title) +{ + while (stutter_pause_test || !rcutorture_runnable) { + if (rcutorture_runnable) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(round_jiffies_relative(HZ)); + rcutorture_shutdown_absorb(title); + } +} + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_torture_ops { + void (*init)(void); + int (*readlock)(void); + void (*read_delay)(struct rcu_random_state *rrsp); + void (*readunlock)(int idx); + int (*completed)(void); + void (*deferred_free)(struct rcu_torture *p); + void (*sync)(void); + void (*exp_sync)(void); + void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); + void (*cb_barrier)(void); + void (*fqs)(void); + int (*stats)(char *page); + int irq_capable; + int can_boost; + const char *name; +}; + +static struct rcu_torture_ops *cur_ops; + +/* + * Definitions for rcu torture testing. + */ + +static int rcu_torture_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_read_delay(struct rcu_random_state *rrsp) +{ + const unsigned long shortdelay_us = 200; + const unsigned long longdelay_ms = 50; + + /* We want a short delay sometimes to make a reader delay the grace + * period, and we want a long delay occasionally to trigger + * force_quiescent_state. */ + + if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) + preempt_schedule(); /* No QS if preempt_disable() in effect */ +#endif +} + +static void rcu_torture_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static int rcu_torture_completed(void) +{ + return rcu_batches_completed(); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop != FULLSTOP_DONTSTOP) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. */ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + rcu_torture_free(rp); + } else { + cur_ops->deferred_free(rp); + } +} + +static int rcu_no_completed(void) +{ + return 0; +} + +static void rcu_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu(&p->rtort_rcu, rcu_torture_cb); +} + +static void rcu_sync_torture_init(void) +{ + INIT_LIST_HEAD(&rcu_torture_removed); +} + +static struct rcu_torture_ops rcu_ops = { + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_torture_deferred_free, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .call = call_rcu, + .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .can_boost = rcu_can_boost(), + .name = "rcu" +}; + +/* + * Definitions for rcu_bh torture testing. + */ + +static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static int rcu_bh_torture_completed(void) +{ + return rcu_batches_completed_bh(); +} + +static void rcu_bh_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops rcu_bh_ops = { + .init = rcu_sync_torture_init, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_bh_torture_deferred_free, + .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, + .call = call_rcu_bh, + .cb_barrier = rcu_barrier_bh, + .fqs = rcu_bh_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh" +}; + +/* + * Definitions for srcu torture testing. + */ + +DEFINE_STATIC_SRCU(srcu_ctl); + +static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) +{ + return srcu_read_lock(&srcu_ctl); +} + +static void srcu_read_delay(struct rcu_random_state *rrsp) +{ + long delay; + const long uspertick = 1000000 / HZ; + const long longdelay = 10; + + /* We want there to be long-running readers, but not all the time. */ + + delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); + if (!delay) + schedule_timeout_interruptible(longdelay); + else + rcu_read_delay(rrsp); +} + +static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) +{ + srcu_read_unlock(&srcu_ctl, idx); +} + +static int srcu_torture_completed(void) +{ + return srcu_batches_completed(&srcu_ctl); +} + +static void srcu_torture_deferred_free(struct rcu_torture *rp) +{ + call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); +} + +static void srcu_torture_synchronize(void) +{ + synchronize_srcu(&srcu_ctl); +} + +static void srcu_torture_call(struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + call_srcu(&srcu_ctl, head, func); +} + +static void srcu_torture_barrier(void) +{ + srcu_barrier(&srcu_ctl); +} + +static int srcu_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + int idx = srcu_ctl.completed & 0x1; + + cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); + for_each_possible_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} + +static void srcu_torture_synchronize_expedited(void) +{ + synchronize_srcu_expedited(&srcu_ctl); +} + +static struct rcu_torture_ops srcu_ops = { + .init = rcu_sync_torture_init, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = srcu_torture_deferred_free, + .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, + .stats = srcu_torture_stats, + .name = "srcu" +}; + +/* + * Definitions for sched torture testing. + */ + +static int sched_torture_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_torture_read_unlock(int idx) +{ + preempt_enable(); +} + +static void rcu_sched_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops sched_ops = { + .init = rcu_sync_torture_init, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_sched_torture_deferred_free, + .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .call = call_rcu_sched, + .cb_barrier = rcu_barrier_sched, + .fqs = rcu_sched_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "sched" +}; + +/* + * RCU torture priority-boost testing. Runs one real-time thread per + * CPU for moderate bursts, repeatedly registering RCU callbacks and + * spinning waiting for them to be invoked. If a given callback takes + * too long to be invoked, we assume that priority inversion has occurred. + */ + +struct rcu_boost_inflight { + struct rcu_head rcu; + int inflight; +}; + +static void rcu_torture_boost_cb(struct rcu_head *head) +{ + struct rcu_boost_inflight *rbip = + container_of(head, struct rcu_boost_inflight, rcu); + + smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ + rbip->inflight = 0; +} + +static int rcu_torture_boost(void *arg) +{ + unsigned long call_rcu_time; + unsigned long endtime; + unsigned long oldstarttime; + struct rcu_boost_inflight rbi = { .inflight = 0 }; + struct sched_param sp; + + VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + + /* Set real-time priority. */ + sp.sched_priority = 1; + if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { + VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + n_rcu_torture_boost_rterror++; + } + + init_rcu_head_on_stack(&rbi.rcu); + /* Each pass through the following loop does one boost-test cycle. */ + do { + /* Wait for the next test interval. */ + oldstarttime = boost_starttime; + while (ULONG_CMP_LT(jiffies, oldstarttime)) { + schedule_timeout_interruptible(oldstarttime - jiffies); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* Do one boost-test interval. */ + endtime = oldstarttime + test_boost_duration * HZ; + call_rcu_time = jiffies; + while (ULONG_CMP_LT(jiffies, endtime)) { + /* If we don't have a callback in flight, post one. */ + if (!rbi.inflight) { + smp_mb(); /* RCU core before ->inflight = 1. */ + rbi.inflight = 1; + call_rcu(&rbi.rcu, rcu_torture_boost_cb); + if (jiffies - call_rcu_time > + test_boost_duration * HZ - HZ / 2) { + VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + } + call_rcu_time = jiffies; + } + cond_resched(); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* + * Set the start time of the next test interval. + * Yes, this is vulnerable to long delays, but such + * delays simply cause a false negative for the next + * interval. Besides, we are running at RT priority, + * so delays should be relatively rare. + */ + while (oldstarttime == boost_starttime && + !kthread_should_stop()) { + if (mutex_trylock(&boost_mutex)) { + boost_starttime = jiffies + + test_boost_interval * HZ; + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + + /* Go do the stutter. */ +checkwait: rcu_stutter_wait("rcu_torture_boost"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + /* Clean up and exit. */ + VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + rcutorture_shutdown_absorb("rcu_torture_boost"); + while (!kthread_should_stop() || rbi.inflight) + schedule_timeout_uninterruptible(1); + smp_mb(); /* order accesses to ->inflight before stack-frame death. */ + destroy_rcu_head_on_stack(&rbi.rcu); + return 0; +} + +/* + * RCU torture force-quiescent-state kthread. Repeatedly induces + * bursts of calls to force_quiescent_state(), increasing the probability + * of occurrence of some important types of race conditions. + */ +static int +rcu_torture_fqs(void *arg) +{ + unsigned long fqs_resume_time; + int fqs_burst_remaining; + + VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + do { + fqs_resume_time = jiffies + fqs_stutter * HZ; + while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + !kthread_should_stop()) { + schedule_timeout_interruptible(1); + } + fqs_burst_remaining = fqs_duration; + while (fqs_burst_remaining > 0 && + !kthread_should_stop()) { + cur_ops->fqs(); + udelay(fqs_holdoff); + fqs_burst_remaining -= fqs_holdoff; + } + rcu_stutter_wait("rcu_torture_fqs"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fqs"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). + */ +static int +rcu_torture_writer(void *arg) +{ + bool exp; + int i; + struct rcu_torture *rp; + struct rcu_torture *rp1; + struct rcu_torture *old_rp; + static DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1); + rp = rcu_torture_alloc(); + if (rp == NULL) + continue; + rp->rtort_pipe_count = 0; + udelay(rcu_random(&rand) & 0x3ff); + old_rp = rcu_dereference_check(rcu_torture_current, + current == writer_task); + rp->rtort_mbtest = 1; + rcu_assign_pointer(rcu_torture_current, rp); + smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ + if (old_rp) { + i = old_rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + old_rp->rtort_pipe_count++; + if (gp_normal == gp_exp) + exp = !!(rcu_random(&rand) & 0x80); + else + exp = gp_exp; + if (!exp) { + cur_ops->deferred_free(old_rp); + } else { + cur_ops->exp_sync(); + list_add(&old_rp->rtort_free, + &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, + &rcu_torture_removed, + rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= + RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } + } + } + rcutorture_record_progress(++rcu_torture_current_version); + rcu_stutter_wait("rcu_torture_writer"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + rcutorture_shutdown_absorb("rcu_torture_writer"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture fake writer kthread. Repeatedly calls sync, with a random + * delay between calls. + */ +static int +rcu_torture_fakewriter(void *arg) +{ + DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); + udelay(rcu_random(&rand) & 0x3ff); + if (cur_ops->cb_barrier != NULL && + rcu_random(&rand) % (nfakewriters * 8) == 0) { + cur_ops->cb_barrier(); + } else if (gp_normal == gp_exp) { + if (rcu_random(&rand) & 0x80) + cur_ops->sync(); + else + cur_ops->exp_sync(); + } else if (gp_normal) { + cur_ops->sync(); + } else { + cur_ops->exp_sync(); + } + rcu_stutter_wait("rcu_torture_fakewriter"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fakewriter"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +void rcutorture_trace_dump(void) +{ + static atomic_t beenhere = ATOMIC_INIT(0); + + if (atomic_read(&beenhere)) + return; + if (atomic_xchg(&beenhere, 1) != 0) + return; + ftrace_dump(DUMP_ALL); +} + +/* + * RCU torture reader from timer handler. Dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static void rcu_torture_timer(unsigned long unused) +{ + int idx; + int completed; + int completed_end; + static DEFINE_RCU_RANDOM(rand); + static DEFINE_SPINLOCK(rand_lock); + struct rcu_torture *p; + int pipe_count; + unsigned long long ts; + + idx = cur_ops->readlock(); + completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); + if (p == NULL) { + /* Leave because rcu_torture_writer is not yet underway */ + cur_ops->readunlock(idx); + return; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + spin_lock(&rand_lock); + cur_ops->read_delay(&rand); + n_rcu_torture_timers++; + spin_unlock(&rand_lock); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, + completed, completed_end); + rcutorture_trace_dump(); + } + __this_cpu_inc(rcu_torture_count[pipe_count]); + completed = completed_end - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_batch[completed]); + preempt_enable(); + cur_ops->readunlock(idx); +} + +/* + * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static int +rcu_torture_reader(void *arg) +{ + int completed; + int completed_end; + int idx; + DEFINE_RCU_RANDOM(rand); + struct rcu_torture *p; + int pipe_count; + struct timer_list t; + unsigned long long ts; + + VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); + set_user_nice(current, 19); + if (irqreader && cur_ops->irq_capable) + setup_timer_on_stack(&t, rcu_torture_timer, 0); + + do { + if (irqreader && cur_ops->irq_capable) { + if (!timer_pending(&t)) + mod_timer(&t, jiffies + 1); + } + idx = cur_ops->readlock(); + completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + cur_ops->readunlock(idx); + schedule_timeout_interruptible(HZ); + continue; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + cur_ops->read_delay(&rand); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, + ts, completed, completed_end); + rcutorture_trace_dump(); + } + __this_cpu_inc(rcu_torture_count[pipe_count]); + completed = completed_end - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_batch[completed]); + preempt_enable(); + cur_ops->readunlock(idx); + schedule(); + rcu_stutter_wait("rcu_torture_reader"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + rcutorture_shutdown_absorb("rcu_torture_reader"); + if (irqreader && cur_ops->irq_capable) + del_timer_sync(&t); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * Create an RCU-torture statistics message in the specified buffer. + */ +static int +rcu_torture_printk(char *page) +{ + int cnt = 0; + int cpu; + int i; + long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + + for_each_possible_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; + batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + } + } + for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + if (pipesummary[i] != 0) + break; + } + cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); + cnt += sprintf(&page[cnt], + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free)); + cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", + atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_boost_ktrerror, + n_rcu_torture_boost_rterror); + cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", + n_rcu_torture_boost_failure, + n_rcu_torture_boosts, + n_rcu_torture_timers); + cnt += sprintf(&page[cnt], + "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); + cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", + n_barrier_successes, + n_barrier_attempts, + n_rcu_torture_barrier_error); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); + if (atomic_read(&n_rcu_torture_mberror) != 0 || + n_rcu_torture_barrier_error != 0 || + n_rcu_torture_boost_ktrerror != 0 || + n_rcu_torture_boost_rterror != 0 || + n_rcu_torture_boost_failure != 0 || + i > 1) { + cnt += sprintf(&page[cnt], "!!! "); + atomic_inc(&n_rcu_torture_error); + WARN_ON_ONCE(1); + } + cnt += sprintf(&page[cnt], "Reader Pipe: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); + cnt += sprintf(&page[cnt], "Reader Batch: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); + cnt += sprintf(&page[cnt], "Free-Block Circulation: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + cnt += sprintf(&page[cnt], " %d", + atomic_read(&rcu_torture_wcount[i])); + } + cnt += sprintf(&page[cnt], "\n"); + if (cur_ops->stats) + cnt += cur_ops->stats(&page[cnt]); + return cnt; +} + +/* + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). + */ +static void +rcu_torture_stats_print(void) +{ + int cnt; + + cnt = rcu_torture_printk(printk_buf); + pr_alert("%s", printk_buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. + */ +static int +rcu_torture_stats(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + rcu_torture_stats_print(); + rcutorture_shutdown_absorb("rcu_torture_stats"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); + return 0; +} + +static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ + +/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case + * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. + */ +static void rcu_torture_shuffle_tasks(void) +{ + int i; + + cpumask_setall(shuffle_tmp_mask); + get_online_cpus(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + put_online_cpus(); + return; + } + + if (rcu_idle_cpu != -1) + cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); + + set_cpus_allowed_ptr(current, shuffle_tmp_mask); + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + if (reader_tasks[i]) + set_cpus_allowed_ptr(reader_tasks[i], + shuffle_tmp_mask); + } + if (fakewriter_tasks) { + for (i = 0; i < nfakewriters; i++) + if (fakewriter_tasks[i]) + set_cpus_allowed_ptr(fakewriter_tasks[i], + shuffle_tmp_mask); + } + if (writer_task) + set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); + if (stats_task) + set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); + if (stutter_task) + set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); + if (fqs_task) + set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); + if (shutdown_task) + set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task) + set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + if (stall_task) + set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); + if (barrier_cbs_tasks) + for (i = 0; i < n_barrier_cbs; i++) + if (barrier_cbs_tasks[i]) + set_cpus_allowed_ptr(barrier_cbs_tasks[i], + shuffle_tmp_mask); + if (barrier_task) + set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); + + if (rcu_idle_cpu == -1) + rcu_idle_cpu = num_online_cpus() - 1; + else + rcu_idle_cpu--; + + put_online_cpus(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU. + */ +static int +rcu_torture_shuffle(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval * HZ); + rcu_torture_shuffle_tasks(); + rcutorture_shutdown_absorb("rcu_torture_shuffle"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); + return 0; +} + +/* Cause the rcutorture test to "stutter", starting and stopping all + * threads periodically. + */ +static int +rcu_torture_stutter(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); + do { + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 1; + if (!kthread_should_stop()) + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 0; + rcutorture_shutdown_absorb("rcu_torture_stutter"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); + return 0; +} + +static inline void +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) +{ + pr_alert("%s" TORTURE_FLAG + "--- %s: nreaders=%d nfakewriters=%d " + "stat_interval=%d verbose=%d test_no_idle_hz=%d " + "shuffle_interval=%d stutter=%d irqreader=%d " + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " + "test_boost=%d/%d test_boost_interval=%d " + "test_boost_duration=%d shutdown_secs=%d " + "stall_cpu=%d stall_cpu_holdoff=%d " + "n_barrier_cbs=%d " + "onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, nrealreaders, nfakewriters, + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, + test_boost, cur_ops->can_boost, + test_boost_interval, test_boost_duration, shutdown_secs, + stall_cpu, stall_cpu_holdoff, + n_barrier_cbs, + onoff_interval, onoff_holdoff); +} + +static struct notifier_block rcutorture_shutdown_nb = { + .notifier_call = rcutorture_shutdown_notify, +}; + +static void rcutorture_booster_cleanup(int cpu) +{ + struct task_struct *t; + + if (boost_tasks[cpu] == NULL) + return; + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); + t = boost_tasks[cpu]; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + + /* This must be outside of the mutex, otherwise deadlock! */ + kthread_stop(t); + boost_tasks[cpu] = NULL; +} + +static int rcutorture_booster_init(int cpu) +{ + int retval; + + if (boost_tasks[cpu] != NULL) + return 0; /* Already created, nothing more to do. */ + + /* Don't allow time recalculation while creating a new task. */ + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); + boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, + cpu_to_node(cpu), + "rcu_torture_boost"); + if (IS_ERR(boost_tasks[cpu])) { + retval = PTR_ERR(boost_tasks[cpu]); + VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); + n_rcu_torture_boost_ktrerror++; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + return retval; + } + kthread_bind(boost_tasks[cpu], cpu); + wake_up_process(boost_tasks[cpu]); + mutex_unlock(&boost_mutex); + return 0; +} + +/* + * Cause the rcutorture test to shutdown the system after the test has + * run for the time specified by the shutdown_secs module parameter. + */ +static int +rcu_torture_shutdown(void *arg) +{ + long delta; + unsigned long jiffies_snap; + + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); + jiffies_snap = ACCESS_ONCE(jiffies); + while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + !kthread_should_stop()) { + delta = shutdown_time - jiffies_snap; + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_shutdown task: %lu jiffies remaining\n", + torture_type, delta); + schedule_timeout_interruptible(delta); + jiffies_snap = ACCESS_ONCE(jiffies); + } + if (kthread_should_stop()) { + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); + return 0; + } + + /* OK, shut down the system. */ + + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); + shutdown_task = NULL; /* Avoid self-kill deadlock. */ + rcu_torture_cleanup(); /* Get the success/failure message. */ + kernel_power_off(); /* Shut down the system. */ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Execute random CPU-hotplug operations at the interval specified + * by the onoff_interval. + */ +static int +rcu_torture_onoff(void *arg) +{ + int cpu; + unsigned long delta; + int maxcpu = -1; + DEFINE_RCU_RANDOM(rand); + int ret; + unsigned long starttime; + + VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (onoff_holdoff > 0) { + VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); + schedule_timeout_interruptible(onoff_holdoff * HZ); + VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); + } + while (!kthread_should_stop()) { + cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); + if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_offline_attempts++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlined %d\n", + torture_type, cpu); + n_offline_successes++; + delta = jiffies - starttime; + sum_offline += delta; + if (min_offline < 0) { + min_offline = delta; + max_offline = delta; + } + if (min_offline > delta) + min_offline = delta; + if (max_offline < delta) + max_offline = delta; + } + } else if (cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_online_attempts++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlined %d\n", + torture_type, cpu); + n_online_successes++; + delta = jiffies - starttime; + sum_online += delta; + if (min_online < 0) { + min_online = delta; + max_online = delta; + } + if (min_online > delta) + min_online = delta; + if (max_online < delta) + max_online = delta; + } + } + schedule_timeout_interruptible(onoff_interval * HZ); + } + VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); + return 0; +} + +static int +rcu_torture_onoff_init(void) +{ + int ret; + + if (onoff_interval <= 0) + return 0; + onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); + if (IS_ERR(onoff_task)) { + ret = PTR_ERR(onoff_task); + onoff_task = NULL; + return ret; + } + return 0; +} + +static void rcu_torture_onoff_cleanup(void) +{ + if (onoff_task == NULL) + return; + VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); + kthread_stop(onoff_task); + onoff_task = NULL; +} + +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + +static int +rcu_torture_onoff_init(void) +{ + return 0; +} + +static void rcu_torture_onoff_cleanup(void) +{ +} + +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then + * induces a CPU stall for the time specified by stall_cpu. + */ +static int rcu_torture_stall(void *args) +{ + unsigned long stop_at; + + VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); + if (stall_cpu_holdoff > 0) { + VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); + schedule_timeout_interruptible(stall_cpu_holdoff * HZ); + VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); + } + if (!kthread_should_stop()) { + stop_at = get_seconds() + stall_cpu; + /* RCU CPU stall is expected behavior in following code. */ + pr_alert("rcu_torture_stall start.\n"); + rcu_read_lock(); + preempt_disable(); + while (ULONG_CMP_LT(get_seconds(), stop_at)) + continue; /* Induce RCU CPU stall warning. */ + preempt_enable(); + rcu_read_unlock(); + pr_alert("rcu_torture_stall end.\n"); + } + rcutorture_shutdown_absorb("rcu_torture_stall"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(10 * HZ); + return 0; +} + +/* Spawn CPU-stall kthread, if stall_cpu specified. */ +static int __init rcu_torture_stall_init(void) +{ + int ret; + + if (stall_cpu <= 0) + return 0; + stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); + if (IS_ERR(stall_task)) { + ret = PTR_ERR(stall_task); + stall_task = NULL; + return ret; + } + return 0; +} + +/* Clean up after the CPU-stall kthread, if one was spawned. */ +static void rcu_torture_stall_cleanup(void) +{ + if (stall_task == NULL) + return; + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); + kthread_stop(stall_task); + stall_task = NULL; +} + +/* Callback function for RCU barrier testing. */ +void rcu_torture_barrier_cbf(struct rcu_head *rcu) +{ + atomic_inc(&barrier_cbs_invoked); +} + +/* kthread function to register callbacks used to test RCU barriers. */ +static int rcu_torture_barrier_cbs(void *arg) +{ + long myid = (long)arg; + bool lastphase = 0; + struct rcu_head rcu; + + init_rcu_head_on_stack(&rcu); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); + set_user_nice(current, 19); + do { + wait_event(barrier_cbs_wq[myid], + barrier_phase != lastphase || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + lastphase = barrier_phase; + smp_mb(); /* ensure barrier_phase load before ->call(). */ + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + cur_ops->call(&rcu, rcu_torture_barrier_cbf); + if (atomic_dec_and_test(&barrier_cbs_count)) + wake_up(&barrier_wq); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + cur_ops->cb_barrier(); + destroy_rcu_head_on_stack(&rcu); + return 0; +} + +/* kthread function to drive and coordinate RCU barrier testing. */ +static int rcu_torture_barrier(void *arg) +{ + int i; + + VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); + do { + atomic_set(&barrier_cbs_invoked, 0); + atomic_set(&barrier_cbs_count, n_barrier_cbs); + smp_mb(); /* Ensure barrier_phase after prior assignments. */ + barrier_phase = !barrier_phase; + for (i = 0; i < n_barrier_cbs; i++) + wake_up(&barrier_cbs_wq[i]); + wait_event(barrier_wq, + atomic_read(&barrier_cbs_count) == 0 || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + n_barrier_attempts++; + cur_ops->cb_barrier(); + if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { + n_rcu_torture_barrier_error++; + WARN_ON_ONCE(1); + } + n_barrier_successes++; + schedule_timeout_interruptible(HZ / 10); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + return 0; +} + +/* Initialize RCU barrier testing. */ +static int rcu_torture_barrier_init(void) +{ + int i; + int ret; + + if (n_barrier_cbs == 0) + return 0; + if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { + pr_alert("%s" TORTURE_FLAG + " Call or barrier ops missing for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " RCU barrier testing omitted from run.\n", + torture_type); + return 0; + } + atomic_set(&barrier_cbs_count, 0); + atomic_set(&barrier_cbs_invoked, 0); + barrier_cbs_tasks = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), + GFP_KERNEL); + barrier_cbs_wq = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), + GFP_KERNEL); + if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) + return -ENOMEM; + for (i = 0; i < n_barrier_cbs; i++) { + init_waitqueue_head(&barrier_cbs_wq[i]); + barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, + (void *)(long)i, + "rcu_torture_barrier_cbs"); + if (IS_ERR(barrier_cbs_tasks[i])) { + ret = PTR_ERR(barrier_cbs_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); + barrier_cbs_tasks[i] = NULL; + return ret; + } + } + barrier_task = kthread_run(rcu_torture_barrier, NULL, + "rcu_torture_barrier"); + if (IS_ERR(barrier_task)) { + ret = PTR_ERR(barrier_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); + barrier_task = NULL; + } + return 0; +} + +/* Clean up after RCU barrier testing. */ +static void rcu_torture_barrier_cleanup(void) +{ + int i; + + if (barrier_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); + kthread_stop(barrier_task); + barrier_task = NULL; + } + if (barrier_cbs_tasks != NULL) { + for (i = 0; i < n_barrier_cbs; i++) { + if (barrier_cbs_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); + kthread_stop(barrier_cbs_tasks[i]); + barrier_cbs_tasks[i] = NULL; + } + } + kfree(barrier_cbs_tasks); + barrier_cbs_tasks = NULL; + } + if (barrier_cbs_wq != NULL) { + kfree(barrier_cbs_wq); + barrier_cbs_wq = NULL; + } +} + +static int rcutorture_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + (void)rcutorture_booster_init(cpu); + break; + case CPU_DOWN_PREPARE: + rcutorture_booster_cleanup(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block rcutorture_cpu_nb = { + .notifier_call = rcutorture_cpu_notify, +}; + +static void +rcu_torture_cleanup(void) +{ + int i; + + mutex_lock(&fullstop_mutex); + rcutorture_record_test_transition(); + if (fullstop == FULLSTOP_SHUTDOWN) { + pr_warn(/* but going down anyway, so... */ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + schedule_timeout_uninterruptible(10); + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + return; + } + fullstop = FULLSTOP_RMMOD; + mutex_unlock(&fullstop_mutex); + unregister_reboot_notifier(&rcutorture_shutdown_nb); + rcu_torture_barrier_cleanup(); + rcu_torture_stall_cleanup(); + if (stutter_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); + kthread_stop(stutter_task); + } + stutter_task = NULL; + if (shuffler_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + kthread_stop(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + } + shuffler_task = NULL; + + if (writer_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + kthread_stop(writer_task); + } + writer_task = NULL; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) { + if (reader_tasks[i]) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_reader task"); + kthread_stop(reader_tasks[i]); + } + reader_tasks[i] = NULL; + } + kfree(reader_tasks); + reader_tasks = NULL; + } + rcu_torture_current = NULL; + + if (fakewriter_tasks) { + for (i = 0; i < nfakewriters; i++) { + if (fakewriter_tasks[i]) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_fakewriter task"); + kthread_stop(fakewriter_tasks[i]); + } + fakewriter_tasks[i] = NULL; + } + kfree(fakewriter_tasks); + fakewriter_tasks = NULL; + } + + if (stats_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + kthread_stop(stats_task); + } + stats_task = NULL; + + if (fqs_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); + kthread_stop(fqs_task); + } + fqs_task = NULL; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + unregister_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) + rcutorture_booster_cleanup(i); + } + if (shutdown_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); + kthread_stop(shutdown_task); + } + shutdown_task = NULL; + rcu_torture_onoff_cleanup(); + + /* Wait for all RCU callbacks to fire. */ + + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + + rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) + rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); + else if (n_online_successes != n_online_attempts || + n_offline_successes != n_offline_attempts) + rcu_torture_print_module_parms(cur_ops, + "End of test: RCU_HOTPLUG"); + else + rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); +} + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static void rcu_torture_leak_cb(struct rcu_head *rhp) +{ +} + +static void rcu_torture_err_cb(struct rcu_head *rhp) +{ + /* + * This -might- happen due to race conditions, but is unlikely. + * The scenario that leads to this happening is that the + * first of the pair of duplicate callbacks is queued, + * someone else starts a grace period that includes that + * callback, then the second of the pair must wait for the + * next grace period. Unlikely, but can happen. If it + * does happen, the debug-objects subsystem won't have splatted. + */ + pr_alert("rcutorture: duplicated callback was invoked.\n"); +} +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +/* + * Verify that double-free causes debug-objects to complain, but only + * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test + * cannot be carried out. + */ +static void rcu_test_debug_objects(void) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + struct rcu_head rh1; + struct rcu_head rh2; + + init_rcu_head_on_stack(&rh1); + init_rcu_head_on_stack(&rh2); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + + /* Try to queue the rh2 pair of callbacks for the same grace period. */ + preempt_disable(); /* Prevent preemption from interrupting test. */ + rcu_read_lock(); /* Make it impossible to finish a grace period. */ + call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + local_irq_disable(); /* Make it harder to start a new grace period. */ + call_rcu(&rh2, rcu_torture_leak_cb); + call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + local_irq_enable(); + rcu_read_unlock(); + preempt_enable(); + + /* Wait for them all to get done so we can safely return. */ + rcu_barrier(); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + destroy_rcu_head_on_stack(&rh1); + destroy_rcu_head_on_stack(&rh2); +#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); +#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +} + +static int __init +rcu_torture_init(void) +{ + int i; + int cpu; + int firsterr = 0; + int retval; + static struct rcu_torture_ops *torture_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + }; + + mutex_lock(&fullstop_mutex); + + /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(torture_ops)) { + pr_alert("rcu-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("rcu-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); + mutex_unlock(&fullstop_mutex); + return -EINVAL; + } + if (cur_ops->fqs == NULL && fqs_duration != 0) { + pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); + fqs_duration = 0; + } + if (cur_ops->init) + cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + + if (nreaders >= 0) + nrealreaders = nreaders; + else + nrealreaders = 2 * num_online_cpus(); + rcu_torture_print_module_parms(cur_ops, "Start of test"); + fullstop = FULLSTOP_DONTSTOP; + + /* Set up the freelist. */ + + INIT_LIST_HEAD(&rcu_torture_freelist); + for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { + rcu_tortures[i].rtort_mbtest = 0; + list_add_tail(&rcu_tortures[i].rtort_free, + &rcu_torture_freelist); + } + + /* Initialize the statistics so that each run gets its own numbers. */ + + rcu_torture_current = NULL; + rcu_torture_current_version = 0; + atomic_set(&n_rcu_torture_alloc, 0); + atomic_set(&n_rcu_torture_alloc_fail, 0); + atomic_set(&n_rcu_torture_free, 0); + atomic_set(&n_rcu_torture_mberror, 0); + atomic_set(&n_rcu_torture_error, 0); + n_rcu_torture_barrier_error = 0; + n_rcu_torture_boost_ktrerror = 0; + n_rcu_torture_boost_rterror = 0; + n_rcu_torture_boost_failure = 0; + n_rcu_torture_boosts = 0; + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + atomic_set(&rcu_torture_wcount[i], 0); + for_each_possible_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + per_cpu(rcu_torture_count, cpu)[i] = 0; + per_cpu(rcu_torture_batch, cpu)[i] = 0; + } + } + + /* Start up the kthreads. */ + + VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); + writer_task = kthread_create(rcu_torture_writer, NULL, + "rcu_torture_writer"); + if (IS_ERR(writer_task)) { + firsterr = PTR_ERR(writer_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); + writer_task = NULL; + goto unwind; + } + wake_up_process(writer_task); + fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), + GFP_KERNEL); + if (fakewriter_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nfakewriters; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); + fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, + "rcu_torture_fakewriter"); + if (IS_ERR(fakewriter_tasks[i])) { + firsterr = PTR_ERR(fakewriter_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); + fakewriter_tasks[i] = NULL; + goto unwind; + } + } + reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); + reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, + "rcu_torture_reader"); + if (IS_ERR(reader_tasks[i])) { + firsterr = PTR_ERR(reader_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); + reader_tasks[i] = NULL; + goto unwind; + } + } + if (stat_interval > 0) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); + stats_task = kthread_run(rcu_torture_stats, NULL, + "rcu_torture_stats"); + if (IS_ERR(stats_task)) { + firsterr = PTR_ERR(stats_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); + stats_task = NULL; + goto unwind; + } + } + if (test_no_idle_hz) { + rcu_idle_cpu = num_online_cpus() - 1; + + if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { + firsterr = -ENOMEM; + VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); + goto unwind; + } + + /* Create the shuffler thread */ + shuffler_task = kthread_run(rcu_torture_shuffle, NULL, + "rcu_torture_shuffle"); + if (IS_ERR(shuffler_task)) { + free_cpumask_var(shuffle_tmp_mask); + firsterr = PTR_ERR(shuffler_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); + shuffler_task = NULL; + goto unwind; + } + } + if (stutter < 0) + stutter = 0; + if (stutter) { + /* Create the stutter thread */ + stutter_task = kthread_run(rcu_torture_stutter, NULL, + "rcu_torture_stutter"); + if (IS_ERR(stutter_task)) { + firsterr = PTR_ERR(stutter_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); + stutter_task = NULL; + goto unwind; + } + } + if (fqs_duration < 0) + fqs_duration = 0; + if (fqs_duration) { + /* Create the stutter thread */ + fqs_task = kthread_run(rcu_torture_fqs, NULL, + "rcu_torture_fqs"); + if (IS_ERR(fqs_task)) { + firsterr = PTR_ERR(fqs_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); + fqs_task = NULL; + goto unwind; + } + } + if (test_boost_interval < 1) + test_boost_interval = 1; + if (test_boost_duration < 2) + test_boost_duration = 2; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + + boost_starttime = jiffies + test_boost_interval * HZ; + register_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) { + if (cpu_is_offline(i)) + continue; /* Heuristic: CPU can go offline. */ + retval = rcutorture_booster_init(i); + if (retval < 0) { + firsterr = retval; + goto unwind; + } + } + } + if (shutdown_secs > 0) { + shutdown_time = jiffies + shutdown_secs * HZ; + shutdown_task = kthread_create(rcu_torture_shutdown, NULL, + "rcu_torture_shutdown"); + if (IS_ERR(shutdown_task)) { + firsterr = PTR_ERR(shutdown_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); + shutdown_task = NULL; + goto unwind; + } + wake_up_process(shutdown_task); + } + i = rcu_torture_onoff_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } + register_reboot_notifier(&rcutorture_shutdown_nb); + i = rcu_torture_stall_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } + retval = rcu_torture_barrier_init(); + if (retval != 0) { + firsterr = retval; + goto unwind; + } + if (object_debug) + rcu_test_debug_objects(); + rcutorture_record_test_transition(); + mutex_unlock(&fullstop_mutex); + return 0; + +unwind: + mutex_unlock(&fullstop_mutex); + rcu_torture_cleanup(); + return firsterr; +} + +module_init(rcu_torture_init); +module_exit(rcu_torture_cleanup); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c new file mode 100644 index 000000000000..8a2c81e86dda --- /dev/null +++ b/kernel/rcu/tree.c @@ -0,0 +1,3403 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Authors: Dipankar Sarma + * Manfred Spraul + * Paul E. McKenney Hierarchical version + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tree.h" +#include + +#include "rcu.h" + +MODULE_ALIAS("rcutree"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcutree." + +/* Data structures. */ + +static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; + +/* + * In order to export the rcu_state name to the tracing tools, it + * needs to be added in the __tracepoint_string section. + * This requires defining a separate variable tp__varname + * that points to the string being used, and this will allow + * the tracing userspace tools to be able to decipher the string + * address to the matching string. + */ +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ +static char sname##_varname[] = #sname; \ +static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ +struct rcu_state sname##_state = { \ + .level = { &sname##_state.node[0] }, \ + .call = cr, \ + .fqs_state = RCU_GP_IDLE, \ + .gpnum = 0UL - 300UL, \ + .completed = 0UL - 300UL, \ + .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ + .orphan_nxttail = &sname##_state.orphan_nxtlist, \ + .orphan_donetail = &sname##_state.orphan_donelist, \ + .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ + .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ + .name = sname##_varname, \ + .abbr = sabbr, \ +}; \ +DEFINE_PER_CPU(struct rcu_data, sname##_data) + +RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); +RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); + +static struct rcu_state *rcu_state; +LIST_HEAD(rcu_struct_flavors); + +/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ +static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; +module_param(rcu_fanout_leaf, int, 0444); +int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; +static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ + NUM_RCU_LVL_0, + NUM_RCU_LVL_1, + NUM_RCU_LVL_2, + NUM_RCU_LVL_3, + NUM_RCU_LVL_4, +}; +int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ + +/* + * The rcu_scheduler_active variable transitions from zero to one just + * before the first task is spawned. So when this variable is zero, RCU + * can assume that there is but one task, allowing RCU to (for example) + * optimize synchronize_sched() to a simple barrier(). When this variable + * is one, RCU must actually do all the hard work required to detect real + * grace periods. This variable is also used to suppress boot-time false + * positives from lockdep-RCU error checking. + */ +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); + +/* + * The rcu_scheduler_fully_active variable transitions from zero to one + * during the early_initcall() processing, which is after the scheduler + * is capable of creating new tasks. So RCU processing (for example, + * creating tasks for RCU priority boosting) must be delayed until after + * rcu_scheduler_fully_active transitions from zero to one. We also + * currently delay invocation of any RCU callbacks until after this point. + * + * It might later prove better for people registering RCU callbacks during + * early boot to take responsibility for these callbacks, but one step at + * a time. + */ +static int rcu_scheduler_fully_active __read_mostly; + +#ifdef CONFIG_RCU_BOOST + +/* + * Control variables for per-CPU and per-rcu_node kthreads. These + * handle all flavors of RCU. + */ +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DEFINE_PER_CPU(char, rcu_cpu_has_work); + +#endif /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static void invoke_rcu_core(void); +static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); + +/* + * Track the rcutorture test sequence number and the update version + * number within a given test. The rcutorture_testseq is incremented + * on every rcutorture module load and unload, so has an odd value + * when a test is running. The rcutorture_vernum is set to zero + * when rcutorture starts and is incremented on each rcutorture update. + * These variables enable correlating rcutorture output with the + * RCU tracing information. + */ +unsigned long rcutorture_testseq; +unsigned long rcutorture_vernum; + +/* + * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s + * permit this function to be invoked without holding the root rcu_node + * structure's ->lock, but of course results can be subject to change. + */ +static int rcu_gp_in_progress(struct rcu_state *rsp) +{ + return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); +} + +/* + * Note a quiescent state. Because we do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period, this just sets a flag. + * The caller must have disabled preemption. + */ +void rcu_sched_qs(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); + + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); + rdp->passed_quiesce = 1; +} + +void rcu_bh_qs(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); + rdp->passed_quiesce = 1; +} + +/* + * Note a context switch. This is a quiescent state for RCU-sched, + * and requires special handling for preemptible RCU. + * The caller must have disabled preemption. + */ +void rcu_note_context_switch(int cpu) +{ + trace_rcu_utilization(TPS("Start context switch")); + rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); + trace_rcu_utilization(TPS("End context switch")); +} +EXPORT_SYMBOL_GPL(rcu_note_context_switch); + +static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { + .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, + .dynticks = ATOMIC_INIT(1), +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, + .dynticks_idle = ATOMIC_INIT(1), +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +}; + +static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ +static long qhimark = 10000; /* If this many pending, ignore blimit. */ +static long qlowmark = 100; /* Once only this many pending, use blimit. */ + +module_param(blimit, long, 0444); +module_param(qhimark, long, 0444); +module_param(qlowmark, long, 0444); + +static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_next_fqs = ULONG_MAX; + +module_param(jiffies_till_first_fqs, ulong, 0644); +module_param(jiffies_till_next_fqs, ulong, 0644); + +static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp); +static void force_qs_rnp(struct rcu_state *rsp, + int (*f)(struct rcu_data *rsp, bool *isidle, + unsigned long *maxj), + bool *isidle, unsigned long *maxj); +static void force_quiescent_state(struct rcu_state *rsp); +static int rcu_pending(int cpu); + +/* + * Return the number of RCU-sched batches processed thus far for debug & stats. + */ +long rcu_batches_completed_sched(void) +{ + return rcu_sched_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); + +/* + * Return the number of RCU BH batches processed thus far for debug & stats. + */ +long rcu_batches_completed_bh(void) +{ + return rcu_bh_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); + +/* + * Force a quiescent state for RCU BH. + */ +void rcu_bh_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_bh_state); +} +EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); + +/* + * Record the number of times rcutorture tests have been initiated and + * terminated. This information allows the debugfs tracing stats to be + * correlated to the rcutorture messages, even when the rcutorture module + * is being repeatedly loaded and unloaded. In other words, we cannot + * store this state in rcutorture itself. + */ +void rcutorture_record_test_transition(void) +{ + rcutorture_testseq++; + rcutorture_vernum = 0; +} +EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); + +/* + * Record the number of writer passes through the current rcutorture test. + * This is also used to correlate debugfs tracing stats with the rcutorture + * messages. + */ +void rcutorture_record_progress(unsigned long vernum) +{ + rcutorture_vernum++; +} +EXPORT_SYMBOL_GPL(rcutorture_record_progress); + +/* + * Force a quiescent state for RCU-sched. + */ +void rcu_sched_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_sched_state); +} +EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); + +/* + * Does the CPU have callbacks ready to be invoked? + */ +static int +cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) +{ + return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && + rdp->nxttail[RCU_DONE_TAIL] != NULL; +} + +/* + * Does the current CPU require a not-yet-started grace period? + * The caller must have disabled interrupts to prevent races with + * normal callback registry. + */ +static int +cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) +{ + int i; + + if (rcu_gp_in_progress(rsp)) + return 0; /* No, a grace period is already in progress. */ + if (rcu_nocb_needs_gp(rsp)) + return 1; /* Yes, a no-CBs CPU needs one. */ + if (!rdp->nxttail[RCU_NEXT_TAIL]) + return 0; /* No, this is a no-CBs (or offline) CPU. */ + if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) + return 1; /* Yes, this CPU has newly registered callbacks. */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rdp->nxttail[i - 1] != rdp->nxttail[i] && + ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), + rdp->nxtcompleted[i])) + return 1; /* Yes, CBs for future grace period. */ + return 0; /* No grace period needed. */ +} + +/* + * Return the root node of the specified rcu_state structure. + */ +static struct rcu_node *rcu_get_root(struct rcu_state *rsp) +{ + return &rsp->node[0]; +} + +/* + * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state + * + * If the new value of the ->dynticks_nesting counter now is zero, + * we really have entered idle, and must do the appropriate accounting. + * The caller must have disabled interrupts. + */ +static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, + bool user) +{ + trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); + if (!user && !is_idle_task(current)) { + struct task_struct *idle __maybe_unused = + idle_task(smp_processor_id()); + + trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); + ftrace_dump(DUMP_ORIG); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ + } + rcu_prepare_for_idle(smp_processor_id()); + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + + /* + * It is illegal to enter an extended quiescent state while + * in an RCU read-side critical section. + */ + rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), + "Illegal idle entry in RCU read-side critical section."); + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), + "Illegal idle entry in RCU-bh read-side critical section."); + rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), + "Illegal idle entry in RCU-sched read-side critical section."); +} + +/* + * Enter an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. + */ +static void rcu_eqs_enter(bool user) +{ + long long oldval; + struct rcu_dynticks *rdtp; + + rdtp = this_cpu_ptr(&rcu_dynticks); + oldval = rdtp->dynticks_nesting; + WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); + if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + rdtp->dynticks_nesting = 0; + else + rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; + rcu_eqs_enter_common(rdtp, oldval, user); +} + +/** + * rcu_idle_enter - inform RCU that current CPU is entering idle + * + * Enter idle mode, in other words, -leave- the mode in which RCU + * read-side critical sections can occur. (Though RCU read-side + * critical sections can occur in irq handlers in idle, a possibility + * handled by irq_enter() and irq_exit().) + * + * We crowbar the ->dynticks_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. + */ +void rcu_idle_enter(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_eqs_enter(false); + rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(rcu_idle_enter); + +#ifdef CONFIG_RCU_USER_QS +/** + * rcu_user_enter - inform RCU that we are resuming userspace. + * + * Enter RCU idle mode right before resuming userspace. No use of RCU + * is permitted between this call and rcu_user_exit(). This way the + * CPU doesn't need to maintain the tick for RCU maintenance purposes + * when the CPU runs in userspace. + */ +void rcu_user_enter(void) +{ + rcu_eqs_enter(1); +} +#endif /* CONFIG_RCU_USER_QS */ + +/** + * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle + * + * Exit from an interrupt handler, which might possibly result in entering + * idle mode, in other words, leaving the mode in which read-side critical + * sections can occur. + * + * This code assumes that the idle loop never does anything that might + * result in unbalanced calls to irq_enter() and irq_exit(). If your + * architecture violates this assumption, RCU will give you what you + * deserve, good and hard. But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. + */ +void rcu_irq_exit(void) +{ + unsigned long flags; + long long oldval; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = this_cpu_ptr(&rcu_dynticks); + oldval = rdtp->dynticks_nesting; + rdtp->dynticks_nesting--; + WARN_ON_ONCE(rdtp->dynticks_nesting < 0); + if (rdtp->dynticks_nesting) + trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); + else + rcu_eqs_enter_common(rdtp, oldval, true); + rcu_sysidle_enter(rdtp, 1); + local_irq_restore(flags); +} + +/* + * rcu_eqs_exit_common - current CPU moving away from extended quiescent state + * + * If the new value of the ->dynticks_nesting counter was previously zero, + * we really have exited idle, and must do the appropriate accounting. + * The caller must have disabled interrupts. + */ +static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, + int user) +{ + smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ + atomic_inc(&rdtp->dynticks); + /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ + smp_mb__after_atomic_inc(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + rcu_cleanup_after_idle(smp_processor_id()); + trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); + if (!user && !is_idle_task(current)) { + struct task_struct *idle __maybe_unused = + idle_task(smp_processor_id()); + + trace_rcu_dyntick(TPS("Error on exit: not idle task"), + oldval, rdtp->dynticks_nesting); + ftrace_dump(DUMP_ORIG); + WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", + current->pid, current->comm, + idle->pid, idle->comm); /* must be idle task! */ + } +} + +/* + * Exit an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. + */ +static void rcu_eqs_exit(bool user) +{ + struct rcu_dynticks *rdtp; + long long oldval; + + rdtp = this_cpu_ptr(&rcu_dynticks); + oldval = rdtp->dynticks_nesting; + WARN_ON_ONCE(oldval < 0); + if (oldval & DYNTICK_TASK_NEST_MASK) + rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; + else + rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rcu_eqs_exit_common(rdtp, oldval, user); +} + +/** + * rcu_idle_exit - inform RCU that current CPU is leaving idle + * + * Exit idle mode, in other words, -enter- the mode in which RCU + * read-side critical sections can occur. + * + * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to + * allow for the possibility of usermode upcalls messing up our count + * of interrupt nesting level during the busy period that is just + * now starting. + */ +void rcu_idle_exit(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_eqs_exit(false); + rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(rcu_idle_exit); + +#ifdef CONFIG_RCU_USER_QS +/** + * rcu_user_exit - inform RCU that we are exiting userspace. + * + * Exit RCU idle mode while entering the kernel because it can + * run a RCU read side critical section anytime. + */ +void rcu_user_exit(void) +{ + rcu_eqs_exit(1); +} +#endif /* CONFIG_RCU_USER_QS */ + +/** + * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle + * + * Enter an interrupt handler, which might possibly result in exiting + * idle mode, in other words, entering the mode in which read-side critical + * sections can occur. + * + * Note that the Linux kernel is fully capable of entering an interrupt + * handler that it never exits, for example when doing upcalls to + * user mode! This code assumes that the idle loop never does upcalls to + * user mode. If your architecture does do upcalls from the idle loop (or + * does anything else that results in unbalanced calls to the irq_enter() + * and irq_exit() functions), RCU will give you what you deserve, good + * and hard. But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. + */ +void rcu_irq_enter(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + long long oldval; + + local_irq_save(flags); + rdtp = this_cpu_ptr(&rcu_dynticks); + oldval = rdtp->dynticks_nesting; + rdtp->dynticks_nesting++; + WARN_ON_ONCE(rdtp->dynticks_nesting == 0); + if (oldval) + trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); + else + rcu_eqs_exit_common(rdtp, oldval, true); + rcu_sysidle_exit(rdtp, 1); + local_irq_restore(flags); +} + +/** + * rcu_nmi_enter - inform RCU of entry to NMI context + * + * If the CPU was idle with dynamic ticks active, and there is no + * irq handler running, this updates rdtp->dynticks_nmi to let the + * RCU grace-period handling know that the CPU is active. + */ +void rcu_nmi_enter(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + if (rdtp->dynticks_nmi_nesting == 0 && + (atomic_read(&rdtp->dynticks) & 0x1)) + return; + rdtp->dynticks_nmi_nesting++; + smp_mb__before_atomic_inc(); /* Force delay from prior write. */ + atomic_inc(&rdtp->dynticks); + /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ + smp_mb__after_atomic_inc(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); +} + +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * + * If the CPU was idle with dynamic ticks active, and there is no + * irq handler running, this updates rdtp->dynticks_nmi to let the + * RCU grace-period handling know that the CPU is no longer active. + */ +void rcu_nmi_exit(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + if (rdtp->dynticks_nmi_nesting == 0 || + --rdtp->dynticks_nmi_nesting != 0) + return; + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force delay to next write. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); +} + +/** + * __rcu_is_watching - are RCU read-side critical sections safe? + * + * Return true if RCU is watching the running CPU, which means that + * this CPU can safely enter RCU read-side critical sections. Unlike + * rcu_is_watching(), the caller of __rcu_is_watching() must have at + * least disabled preemption. + */ +bool __rcu_is_watching(void) +{ + return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; +} + +/** + * rcu_is_watching - see if RCU thinks that the current CPU is idle + * + * If the current CPU is in its idle loop and is neither in an interrupt + * or NMI handler, return true. + */ +bool rcu_is_watching(void) +{ + int ret; + + preempt_disable(); + ret = __rcu_is_watching(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(rcu_is_watching); + +#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) + +/* + * Is the current CPU online? Disable preemption to avoid false positives + * that could otherwise happen due to the current CPU number being sampled, + * this task being preempted, its old CPU being taken offline, resuming + * on some other CPU, then determining that its old CPU is now offline. + * It is OK to use RCU on an offline processor during initial boot, hence + * the check for rcu_scheduler_fully_active. Note also that it is OK + * for a CPU coming online to use RCU for one jiffy prior to marking itself + * online in the cpu_online_mask. Similarly, it is OK for a CPU going + * offline to continue to use RCU for one jiffy after marking itself + * offline in the cpu_online_mask. This leniency is necessary given the + * non-atomic nature of the online and offline processing, for example, + * the fact that a CPU enters the scheduler after completing the CPU_DYING + * notifiers. + * + * This is also why RCU internally marks CPUs online during the + * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. + * + * Disable checking if in an NMI handler because we cannot safely report + * errors from NMI handlers anyway. + */ +bool rcu_lockdep_current_cpu_online(void) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + bool ret; + + if (in_nmi()) + return 1; + preempt_disable(); + rdp = this_cpu_ptr(&rcu_sched_data); + rnp = rdp->mynode; + ret = (rdp->grpmask & rnp->qsmaskinit) || + !rcu_scheduler_fully_active; + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ + +/** + * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle + * + * If the current CPU is idle or running at a first-level (not nested) + * interrupt from idle, return true. The caller must have at least + * disabled preemption. + */ +static int rcu_is_cpu_rrupt_from_idle(void) +{ + return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; +} + +/* + * Snapshot the specified CPU's dynticks counter so that we can later + * credit them with an implicit quiescent state. Return 1 if this CPU + * is in dynticks idle mode, which is an extended quiescent state. + */ +static int dyntick_save_progress_counter(struct rcu_data *rdp, + bool *isidle, unsigned long *maxj) +{ + rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); + return (rdp->dynticks_snap & 0x1) == 0; +} + +/* + * Return true if the specified CPU has passed through a quiescent + * state by virtue of being in or having passed through an dynticks + * idle state since the last call to dyntick_save_progress_counter() + * for this same CPU, or by virtue of having been offline. + */ +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, + bool *isidle, unsigned long *maxj) +{ + unsigned int curr; + unsigned int snap; + + curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); + snap = (unsigned int)rdp->dynticks_snap; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq/NMI handlers, then we can safely pretend that the CPU + * already acknowledged the request to pass through a quiescent + * state. Either way, that CPU cannot possibly be in an RCU + * read-side critical section that started before the beginning + * of the current RCU grace period. + */ + if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); + rdp->dynticks_fqs++; + return 1; + } + + /* + * Check for the CPU being offline, but only if the grace period + * is old enough. We don't need to worry about the CPU changing + * state: If we see it offline even once, it has been through a + * quiescent state. + * + * The reason for insisting that the grace period be at least + * one jiffy old is that CPUs that are not quite online and that + * have just gone offline can still execute RCU read-side critical + * sections. + */ + if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) + return 0; /* Grace period is not old enough. */ + barrier(); + if (cpu_is_offline(rdp->cpu)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); + rdp->offline_fqs++; + return 1; + } + + /* + * There is a possibility that a CPU in adaptive-ticks state + * might run in the kernel with the scheduling-clock tick disabled + * for an extended time period. Invoke rcu_kick_nohz_cpu() to + * force the CPU to restart the scheduling-clock tick in this + * CPU is in this state. + */ + rcu_kick_nohz_cpu(rdp->cpu); + + return 0; +} + +static void record_gp_stall_check_time(struct rcu_state *rsp) +{ + unsigned long j = ACCESS_ONCE(jiffies); + + rsp->gp_start = j; + smp_wmb(); /* Record start time before stall time. */ + rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); +} + +/* + * Dump stacks of all tasks running on stalled CPUs. This is a fallback + * for architectures that do not implement trigger_all_cpu_backtrace(). + * The NMI-triggered stack traces are more accurate because they are + * printed by the target CPU. + */ +static void rcu_dump_cpu_stacks(struct rcu_state *rsp) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + if (rnp->qsmask != 0) { + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) + dump_cpu_task(rnp->grplo + cpu); + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +static void print_other_cpu_stall(struct rcu_state *rsp) +{ + int cpu; + long delta; + unsigned long flags; + int ndetected = 0; + struct rcu_node *rnp = rcu_get_root(rsp); + long totqlen = 0; + + /* Only let one CPU complain about others per time interval. */ + + raw_spin_lock_irqsave(&rnp->lock, flags); + delta = jiffies - rsp->jiffies_stall; + if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ + pr_err("INFO: %s detected stalls on CPUs/tasks:", + rsp->name); + print_cpu_stall_info_begin(); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + ndetected += rcu_print_task_stall(rnp); + if (rnp->qsmask != 0) { + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) { + print_cpu_stall_info(rsp, + rnp->grplo + cpu); + ndetected++; + } + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. + */ + rnp = rcu_get_root(rsp); + raw_spin_lock_irqsave(&rnp->lock, flags); + ndetected += rcu_print_task_stall(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + print_cpu_stall_info_end(); + for_each_possible_cpu(cpu) + totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", + smp_processor_id(), (long)(jiffies - rsp->gp_start), + rsp->gpnum, rsp->completed, totqlen); + if (ndetected == 0) + pr_err("INFO: Stall ended before state dump start\n"); + else if (!trigger_all_cpu_backtrace()) + rcu_dump_cpu_stacks(rsp); + + /* Complain about tasks blocking the grace period. */ + + rcu_print_detail_task_stall(rsp); + + force_quiescent_state(rsp); /* Kick them all. */ +} + +static void print_cpu_stall(struct rcu_state *rsp) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp = rcu_get_root(rsp); + long totqlen = 0; + + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ + pr_err("INFO: %s self-detected stall on CPU", rsp->name); + print_cpu_stall_info_begin(); + print_cpu_stall_info(rsp, smp_processor_id()); + print_cpu_stall_info_end(); + for_each_possible_cpu(cpu) + totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", + jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); + if (!trigger_all_cpu_backtrace()) + dump_stack(); + + raw_spin_lock_irqsave(&rnp->lock, flags); + if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) + rsp->jiffies_stall = jiffies + + 3 * rcu_jiffies_till_stall_check() + 3; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long completed; + unsigned long gpnum; + unsigned long gps; + unsigned long j; + unsigned long js; + struct rcu_node *rnp; + + if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) + return; + j = ACCESS_ONCE(jiffies); + + /* + * Lots of memory barriers to reject false positives. + * + * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, + * then rsp->gp_start, and finally rsp->completed. These values + * are updated in the opposite order with memory barriers (or + * equivalent) during grace-period initialization and cleanup. + * Now, a false positive can occur if we get an new value of + * rsp->gp_start and a old value of rsp->jiffies_stall. But given + * the memory barriers, the only way that this can happen is if one + * grace period ends and another starts between these two fetches. + * Detect this by comparing rsp->completed with the previous fetch + * from rsp->gpnum. + * + * Given this check, comparisons of jiffies, rsp->jiffies_stall, + * and rsp->gp_start suffice to forestall false positives. + */ + gpnum = ACCESS_ONCE(rsp->gpnum); + smp_rmb(); /* Pick up ->gpnum first... */ + js = ACCESS_ONCE(rsp->jiffies_stall); + smp_rmb(); /* ...then ->jiffies_stall before the rest... */ + gps = ACCESS_ONCE(rsp->gp_start); + smp_rmb(); /* ...and finally ->gp_start before ->completed. */ + completed = ACCESS_ONCE(rsp->completed); + if (ULONG_CMP_GE(completed, gpnum) || + ULONG_CMP_LT(j, js) || + ULONG_CMP_GE(gps, js)) + return; /* No stall or GP completed since entering function. */ + rnp = rdp->mynode; + if (rcu_gp_in_progress(rsp) && + (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rsp); + + } else if (rcu_gp_in_progress(rsp) && + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { + + /* They had a few time units to dump stack, so complain. */ + print_other_cpu_stall(rsp); + } +} + +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rsp->jiffies_stall = jiffies + ULONG_MAX / 2; +} + +/* + * Initialize the specified rcu_data structure's callback list to empty. + */ +static void init_callback_list(struct rcu_data *rdp) +{ + int i; + + if (init_nocb_callback_list(rdp)) + return; + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; +} + +/* + * Determine the value that ->completed will have at the end of the + * next subsequent grace period. This is used to tag callbacks so that + * a CPU can invoke callbacks in a timely fashion even if that CPU has + * been dyntick-idle for an extended period with callbacks under the + * influence of RCU_FAST_NO_HZ. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static unsigned long rcu_cbs_completed(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + /* + * If RCU is idle, we just wait for the next grace period. + * But we can only be sure that RCU is idle if we are looking + * at the root rcu_node structure -- otherwise, a new grace + * period might have started, but just not yet gotten around + * to initializing the current non-root rcu_node structure. + */ + if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) + return rnp->completed + 1; + + /* + * Otherwise, wait for a possible partial grace period and + * then the subsequent full grace period. + */ + return rnp->completed + 2; +} + +/* + * Trace-event helper function for rcu_start_future_gp() and + * rcu_nocb_wait_gp(). + */ +static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, const char *s) +{ + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, + rnp->completed, c, rnp->level, + rnp->grplo, rnp->grphi, s); +} + +/* + * Start some future grace period, as needed to handle newly arrived + * callbacks. The required future grace periods are recorded in each + * rcu_node structure's ->need_future_gp field. + * + * The caller must hold the specified rcu_node structure's ->lock. + */ +static unsigned long __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +{ + unsigned long c; + int i; + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + + /* + * Pick up grace-period number for new callbacks. If this + * grace period is already marked as needed, return to the caller. + */ + c = rcu_cbs_completed(rdp->rsp, rnp); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); + if (rnp->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); + return c; + } + + /* + * If either this rcu_node structure or the root rcu_node structure + * believe that a grace period is in progress, then we must wait + * for the one following, which is in "c". Because our request + * will be noticed at the end of the current grace period, we don't + * need to explicitly start one. + */ + if (rnp->gpnum != rnp->completed || + ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { + rnp->need_future_gp[c & 0x1]++; + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); + return c; + } + + /* + * There might be no grace period in progress. If we don't already + * hold it, acquire the root rcu_node structure's lock in order to + * start one (if needed). + */ + if (rnp != rnp_root) + raw_spin_lock(&rnp_root->lock); + + /* + * Get a new grace-period number. If there really is no grace + * period in progress, it will be smaller than the one we obtained + * earlier. Adjust callbacks as needed. Note that even no-CBs + * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. + */ + c = rcu_cbs_completed(rdp->rsp, rnp_root); + for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) + if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) + rdp->nxtcompleted[i] = c; + + /* + * If the needed for the required grace period is already + * recorded, trace and leave. + */ + if (rnp_root->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); + goto unlock_out; + } + + /* Record the need for the future grace period. */ + rnp_root->need_future_gp[c & 0x1]++; + + /* If a grace period is not already in progress, start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); + } else { + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); + rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); + } +unlock_out: + if (rnp != rnp_root) + raw_spin_unlock(&rnp_root->lock); + return c; +} + +/* + * Clean up any old requests for the just-ended grace period. Also return + * whether any additional grace periods have been requested. Also invoke + * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads + * waiting for this grace period to complete. + */ +static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ + int c = rnp->completed; + int needmore; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + + rcu_nocb_gp_cleanup(rsp, rnp); + rnp->need_future_gp[c & 0x1] = 0; + needmore = rnp->need_future_gp[(c + 1) & 0x1]; + trace_rcu_future_gp(rnp, rdp, c, + needmore ? TPS("CleanupMore") : TPS("Cleanup")); + return needmore; +} + +/* + * If there is room, assign a ->completed number to any callbacks on + * this CPU that have not already been assigned. Also accelerate any + * callbacks that were previously assigned a ->completed number that has + * since proven to be too conservative, which can happen if callbacks get + * assigned a ->completed number while RCU is idle, but with reference to + * a non-root rcu_node structure. This function is idempotent, so it does + * not hurt to call it repeatedly. + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + unsigned long c; + int i; + + /* If the CPU has no callbacks, nothing to do. */ + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + return; + + /* + * Starting from the sublist containing the callbacks most + * recently assigned a ->completed number and working down, find the + * first sublist that is not assignable to an upcoming grace period. + * Such a sublist has something in it (first two tests) and has + * a ->completed number assigned that will complete sooner than + * the ->completed number for newly arrived callbacks (last test). + * + * The key point is that any later sublist can be assigned the + * same ->completed number as the newly arrived callbacks, which + * means that the callbacks in any of these later sublist can be + * grouped into a single sublist, whether or not they have already + * been assigned a ->completed number. + */ + c = rcu_cbs_completed(rsp, rnp); + for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) + if (rdp->nxttail[i] != rdp->nxttail[i - 1] && + !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) + break; + + /* + * If there are no sublist for unassigned callbacks, leave. + * At the same time, advance "i" one sublist, so that "i" will + * index into the sublist where all the remaining callbacks should + * be grouped into. + */ + if (++i >= RCU_NEXT_TAIL) + return; + + /* + * Assign all subsequent callbacks' ->completed number to the next + * full grace period and group them all in the sublist initially + * indexed by "i". + */ + for (; i <= RCU_NEXT_TAIL; i++) { + rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtcompleted[i] = c; + } + /* Record any needed additional grace periods. */ + rcu_start_future_gp(rnp, rdp); + + /* Trace depending on how much we were able to accelerate. */ + if (!*rdp->nxttail[RCU_WAIT_TAIL]) + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); + else + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); +} + +/* + * Move any callbacks whose grace period has completed to the + * RCU_DONE_TAIL sublist, then compact the remaining sublists and + * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL + * sublist. This function is idempotent, so it does not hurt to + * invoke it repeatedly. As long as it is not invoked -too- often... + * + * The caller must hold rnp->lock with interrupts disabled. + */ +static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + int i, j; + + /* If the CPU has no callbacks, nothing to do. */ + if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + return; + + /* + * Find all callbacks whose ->completed numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) + break; + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; + } + /* Clean up any sublist tail pointers that were misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; + + /* Copy down callbacks to fill in empty sublists. */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) + break; + rdp->nxttail[j] = rdp->nxttail[i]; + rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; + } + + /* Classify any remaining callbacks. */ + rcu_accelerate_cbs(rsp, rnp, rdp); +} + +/* + * Update CPU-local rcu_data state to record the beginnings and ends of + * grace periods. The caller must hold the ->lock of the leaf rcu_node + * structure corresponding to the current CPU, and must have irqs disabled. + */ +static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + /* Handle the ends of any preceding grace periods first. */ + if (rdp->completed == rnp->completed) { + + /* No grace period end, so just accelerate recent callbacks. */ + rcu_accelerate_cbs(rsp, rnp, rdp); + + } else { + + /* Advance callbacks. */ + rcu_advance_cbs(rsp, rnp, rdp); + + /* Remember that we saw this grace-period completion. */ + rdp->completed = rnp->completed; + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); + } + + if (rdp->gpnum != rnp->gpnum) { + /* + * If the current grace period is waiting for this CPU, + * set up to detect a quiescent state, otherwise don't + * go looking for one. + */ + rdp->gpnum = rnp->gpnum; + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); + rdp->passed_quiesce = 0; + rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); + zero_cpu_stall_ticks(rdp); + } +} + +static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_node *rnp; + + local_irq_save(flags); + rnp = rdp->mynode; + if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && + rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ + !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + local_irq_restore(flags); + return; + } + __note_gp_changes(rsp, rnp, rdp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Initialize a new grace period. Return 0 if no grace period required. + */ +static int rcu_gp_init(struct rcu_state *rsp) +{ + struct rcu_data *rdp; + struct rcu_node *rnp = rcu_get_root(rsp); + + rcu_bind_gp_kthread(); + raw_spin_lock_irq(&rnp->lock); + if (rsp->gp_flags == 0) { + /* Spurious wakeup, tell caller to go back to sleep. */ + raw_spin_unlock_irq(&rnp->lock); + return 0; + } + rsp->gp_flags = 0; /* Clear all flags: New grace period. */ + + if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { + /* + * Grace period already in progress, don't start another. + * Not supposed to be able to happen. + */ + raw_spin_unlock_irq(&rnp->lock); + return 0; + } + + /* Advance to a new grace period and initialize state. */ + record_gp_stall_check_time(rsp); + smp_wmb(); /* Record GP times before starting GP. */ + rsp->gpnum++; + trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); + raw_spin_unlock_irq(&rnp->lock); + + /* Exclude any concurrent CPU-hotplug operations. */ + mutex_lock(&rsp->onoff_mutex); + + /* + * Set the quiescent-state-needed bits in all the rcu_node + * structures for all currently online CPUs in breadth-first order, + * starting from the root rcu_node structure, relying on the layout + * of the tree within the rsp->node[] array. Note that other CPUs + * will access only the leaves of the hierarchy, thus seeing that no + * grace period is in progress, at least until the corresponding + * leaf node has been initialized. In addition, we have excluded + * CPU-hotplug operations. + * + * The grace period cannot complete until the initialization + * process finishes, because this kthread handles both. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irq(&rnp->lock); + rdp = this_cpu_ptr(rsp->rda); + rcu_preempt_check_blocked_tasks(rnp); + rnp->qsmask = rnp->qsmaskinit; + ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; + WARN_ON_ONCE(rnp->completed != rsp->completed); + ACCESS_ONCE(rnp->completed) = rsp->completed; + if (rnp == rdp->mynode) + __note_gp_changes(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); + trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + rnp->level, rnp->grplo, + rnp->grphi, rnp->qsmask); + raw_spin_unlock_irq(&rnp->lock); +#ifdef CONFIG_PROVE_RCU_DELAY + if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && + system_state == SYSTEM_RUNNING) + udelay(200); +#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ + cond_resched(); + } + + mutex_unlock(&rsp->onoff_mutex); + return 1; +} + +/* + * Do one round of quiescent-state forcing. + */ +static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +{ + int fqs_state = fqs_state_in; + bool isidle = false; + unsigned long maxj; + struct rcu_node *rnp = rcu_get_root(rsp); + + rsp->n_force_qs++; + if (fqs_state == RCU_SAVE_DYNTICK) { + /* Collect dyntick-idle snapshots. */ + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; + maxj = jiffies - ULONG_MAX / 4; + } + force_qs_rnp(rsp, dyntick_save_progress_counter, + &isidle, &maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); + fqs_state = RCU_FORCE_QS; + } else { + /* Handle dyntick-idle and offline CPUs. */ + isidle = 0; + force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); + } + /* Clear flag to prevent immediate re-entry. */ + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + raw_spin_lock_irq(&rnp->lock); + rsp->gp_flags &= ~RCU_GP_FLAG_FQS; + raw_spin_unlock_irq(&rnp->lock); + } + return fqs_state; +} + +/* + * Clean up after the old grace period. + */ +static void rcu_gp_cleanup(struct rcu_state *rsp) +{ + unsigned long gp_duration; + int nocb = 0; + struct rcu_data *rdp; + struct rcu_node *rnp = rcu_get_root(rsp); + + raw_spin_lock_irq(&rnp->lock); + gp_duration = jiffies - rsp->gp_start; + if (gp_duration > rsp->gp_max) + rsp->gp_max = gp_duration; + + /* + * We know the grace period is complete, but to everyone else + * it appears to still be ongoing. But it is also the case + * that to everyone else it looks like there is nothing that + * they can do to advance the grace period. It is therefore + * safe for us to drop the lock in order to mark the grace + * period as completed in all of the rcu_node structures. + */ + raw_spin_unlock_irq(&rnp->lock); + + /* + * Propagate new ->completed value to rcu_node structures so + * that other CPUs don't have to wait until the start of the next + * grace period to process their callbacks. This also avoids + * some nasty RCU grace-period initialization races by forcing + * the end of the current grace period to be completely recorded in + * all of the rcu_node structures before the beginning of the next + * grace period is recorded in any of the rcu_node structures. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irq(&rnp->lock); + ACCESS_ONCE(rnp->completed) = rsp->gpnum; + rdp = this_cpu_ptr(rsp->rda); + if (rnp == rdp->mynode) + __note_gp_changes(rsp, rnp, rdp); + nocb += rcu_future_gp_cleanup(rsp, rnp); + raw_spin_unlock_irq(&rnp->lock); + cond_resched(); + } + rnp = rcu_get_root(rsp); + raw_spin_lock_irq(&rnp->lock); + rcu_nocb_gp_set(rnp, nocb); + + rsp->completed = rsp->gpnum; /* Declare grace period done. */ + trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); + rsp->fqs_state = RCU_GP_IDLE; + rdp = this_cpu_ptr(rsp->rda); + rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ + if (cpu_needs_another_gp(rsp, rdp)) { + rsp->gp_flags = RCU_GP_FLAG_INIT; + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("newreq")); + } + raw_spin_unlock_irq(&rnp->lock); +} + +/* + * Body of kthread that handles grace periods. + */ +static int __noreturn rcu_gp_kthread(void *arg) +{ + int fqs_state; + int gf; + unsigned long j; + int ret; + struct rcu_state *rsp = arg; + struct rcu_node *rnp = rcu_get_root(rsp); + + for (;;) { + + /* Handle grace-period start. */ + for (;;) { + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("reqwait")); + wait_event_interruptible(rsp->gp_wq, + ACCESS_ONCE(rsp->gp_flags) & + RCU_GP_FLAG_INIT); + if (rcu_gp_init(rsp)) + break; + cond_resched(); + flush_signals(current); + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("reqwaitsig")); + } + + /* Handle quiescent-state forcing. */ + fqs_state = RCU_SAVE_DYNTICK; + j = jiffies_till_first_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_first_fqs = HZ; + } + ret = 0; + for (;;) { + if (!ret) + rsp->jiffies_force_qs = jiffies + j; + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqswait")); + ret = wait_event_interruptible_timeout(rsp->gp_wq, + ((gf = ACCESS_ONCE(rsp->gp_flags)) & + RCU_GP_FLAG_FQS) || + (!ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)), + j); + /* If grace period done, leave loop. */ + if (!ACCESS_ONCE(rnp->qsmask) && + !rcu_preempt_blocked_readers_cgp(rnp)) + break; + /* If time for quiescent-state forcing, do it. */ + if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || + (gf & RCU_GP_FLAG_FQS)) { + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqsstart")); + fqs_state = rcu_gp_fqs(rsp, fqs_state); + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqsend")); + cond_resched(); + } else { + /* Deal with stray signal. */ + cond_resched(); + flush_signals(current); + trace_rcu_grace_period(rsp->name, + ACCESS_ONCE(rsp->gpnum), + TPS("fqswaitsig")); + } + j = jiffies_till_next_fqs; + if (j > HZ) { + j = HZ; + jiffies_till_next_fqs = HZ; + } else if (j < 1) { + j = 1; + jiffies_till_next_fqs = 1; + } + } + + /* Handle grace-period end. */ + rcu_gp_cleanup(rsp); + } +} + +static void rsp_wakeup(struct irq_work *work) +{ + struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); + + /* Wake up rcu_gp_kthread() to start the grace period. */ + wake_up(&rsp->gp_wq); +} + +/* + * Start a new RCU grace period if warranted, re-initializing the hierarchy + * in preparation for detecting the next grace period. The caller must hold + * the root node's ->lock and hard irqs must be disabled. + * + * Note that it is legal for a dying CPU (which is marked as offline) to + * invoke this function. This can happen when the dying CPU reports its + * quiescent state. + */ +static void +rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) +{ + if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { + /* + * Either we have not yet spawned the grace-period + * task, this CPU does not need another grace period, + * or a grace period is already in progress. + * Either way, don't start a new grace period. + */ + return; + } + rsp->gp_flags = RCU_GP_FLAG_INIT; + trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), + TPS("newreq")); + + /* + * We can't do wakeups while holding the rnp->lock, as that + * could cause possible deadlocks with the rq->lock. Defer + * the wakeup to interrupt context. And don't bother waking + * up the running kthread. + */ + if (current != rsp->gp_kthread) + irq_work_queue(&rsp->wakeup_work); +} + +/* + * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's + * callbacks. Note that rcu_start_gp_advanced() cannot do this because it + * is invoked indirectly from rcu_advance_cbs(), which would result in + * endless recursion -- or would do so if it wasn't for the self-deadlock + * that is encountered beforehand. + */ +static void +rcu_start_gp(struct rcu_state *rsp) +{ + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_node *rnp = rcu_get_root(rsp); + + /* + * If there is no grace period in progress right now, any + * callbacks we have up to this point will be satisfied by the + * next grace period. Also, advancing the callbacks reduces the + * probability of false positives from cpu_needs_another_gp() + * resulting in pointless grace periods. So, advance callbacks + * then start the grace period! + */ + rcu_advance_cbs(rsp, rnp, rdp); + rcu_start_gp_advanced(rsp, rnp, rdp); +} + +/* + * Report a full set of quiescent states to the specified rcu_state + * data structure. This involves cleaning up after the prior grace + * period and letting rcu_start_gp() start up the next grace period + * if one is needed. Note that the caller must hold rnp->lock, which + * is released before return. + */ +static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) +{ + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ +} + +/* + * Similar to rcu_report_qs_rdp(), for which it is a helper function. + * Allows quiescent states for a group of CPUs to be reported at one go + * to the specified rcu_node structure, though all the CPUs in the group + * must be represented by the same rcu_node structure (which need not be + * a leaf rcu_node structure, though it often will be). That structure's + * lock must be held upon entry, and it is released before return. + */ +static void +rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + struct rcu_node *rnp_c; + + /* Walk up the rcu_node hierarchy. */ + for (;;) { + if (!(rnp->qsmask & mask)) { + + /* Our bit has already been cleared, so done. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + rnp->qsmask &= ~mask; + trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, + mask, rnp->qsmask, rnp->level, + rnp->grplo, rnp->grphi, + !!rnp->gp_tasks); + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + + /* Other bits still set at this level, so done. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + mask = rnp->grpmask; + if (rnp->parent == NULL) { + + /* No more levels. Exit loop holding root lock. */ + + break; + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rnp_c = rnp; + rnp = rnp->parent; + raw_spin_lock_irqsave(&rnp->lock, flags); + WARN_ON_ONCE(rnp_c->qsmask); + } + + /* + * Get here if we are the last CPU to pass through a quiescent + * state for this grace period. Invoke rcu_report_qs_rsp() + * to clean up and start the next grace period if one is needed. + */ + rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ +} + +/* + * Record a quiescent state for the specified CPU to that CPU's rcu_data + * structure. This must be either called from the specified CPU, or + * called when the specified CPU is known to be offline (and when it is + * also known that no other CPU is concurrently trying to help the offline + * CPU). The lastcomp argument is used to make sure we are still in the + * grace period of interest. We don't want to end the current grace period + * based on quiescent states detected in an earlier grace period! + */ +static void +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp; + + rnp = rdp->mynode; + raw_spin_lock_irqsave(&rnp->lock, flags); + if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || + rnp->completed == rnp->gpnum) { + + /* + * The grace period in which this quiescent state was + * recorded has ended, so don't report it upwards. + * We will instead need a new quiescent state that lies + * within the current grace period. + */ + rdp->passed_quiesce = 0; /* need qs for new gp. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + mask = rdp->grpmask; + if ((rnp->qsmask & mask) == 0) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } else { + rdp->qs_pending = 0; + + /* + * This GP can't end until cpu checks in, so all of our + * callbacks can be processed during the next GP. + */ + rcu_accelerate_cbs(rsp, rnp, rdp); + + rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ + } +} + +/* + * Check to see if there is a new grace period of which this CPU + * is not yet aware, and if so, set up local rcu_data state for it. + * Otherwise, see if this CPU has just passed through its first + * quiescent state for this grace period, and record that fact if so. + */ +static void +rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) +{ + /* Check for grace-period ends and beginnings. */ + note_gp_changes(rsp, rdp); + + /* + * Does this CPU still need to do its part for current grace period? + * If no, return and let the other CPUs do their part as well. + */ + if (!rdp->qs_pending) + return; + + /* + * Was there a quiescent state since the beginning of the grace + * period? If no, then exit and wait for the next call. + */ + if (!rdp->passed_quiesce) + return; + + /* + * Tell RCU we are done (but rcu_report_qs_rdp() will be the + * judge of that). + */ + rcu_report_qs_rdp(rdp->cpu, rsp, rdp); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Send the specified CPU's RCU callbacks to the orphanage. The + * specified CPU must be offline, and the caller must hold the + * ->orphan_lock. + */ +static void +rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, + struct rcu_node *rnp, struct rcu_data *rdp) +{ + /* No-CBs CPUs do not have orphanable callbacks. */ + if (rcu_is_nocb_cpu(rdp->cpu)) + return; + + /* + * Orphan the callbacks. First adjust the counts. This is safe + * because _rcu_barrier() excludes CPU-hotplug operations, so it + * cannot be running now. Thus no memory barrier is required. + */ + if (rdp->nxtlist != NULL) { + rsp->qlen_lazy += rdp->qlen_lazy; + rsp->qlen += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; + rdp->qlen_lazy = 0; + ACCESS_ONCE(rdp->qlen) = 0; + } + + /* + * Next, move those callbacks still needing a grace period to + * the orphanage, where some other CPU will pick them up. + * Some of the callbacks might have gone partway through a grace + * period, but that is too bad. They get to start over because we + * cannot assume that grace periods are synchronized across CPUs. + * We don't bother updating the ->nxttail[] array yet, instead + * we just reset the whole thing later on. + */ + if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { + *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; + rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = NULL; + } + + /* + * Then move the ready-to-invoke callbacks to the orphanage, + * where some other CPU will pick them up. These will not be + * required to pass though another grace period: They are done. + */ + if (rdp->nxtlist != NULL) { + *rsp->orphan_donetail = rdp->nxtlist; + rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; + } + + /* Finally, initialize the rcu_data structure's list to empty. */ + init_callback_list(rdp); +} + +/* + * Adopt the RCU callbacks from the specified rcu_state structure's + * orphanage. The caller must hold the ->orphan_lock. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ + int i; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + + /* No-CBs CPUs are handled specially. */ + if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) + return; + + /* Do the accounting first. */ + rdp->qlen_lazy += rsp->qlen_lazy; + rdp->qlen += rsp->qlen; + rdp->n_cbs_adopted += rsp->qlen; + if (rsp->qlen_lazy != rsp->qlen) + rcu_idle_count_callbacks_posted(); + rsp->qlen_lazy = 0; + rsp->qlen = 0; + + /* + * We do not need a memory barrier here because the only way we + * can get here if there is an rcu_barrier() in flight is if + * we are the task doing the rcu_barrier(). + */ + + /* First adopt the ready-to-invoke callbacks. */ + if (rsp->orphan_donelist != NULL) { + *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; + for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) + if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[i] = rsp->orphan_donetail; + rsp->orphan_donelist = NULL; + rsp->orphan_donetail = &rsp->orphan_donelist; + } + + /* And then adopt the callbacks that still need a grace period. */ + if (rsp->orphan_nxtlist != NULL) { + *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; + rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; + rsp->orphan_nxtlist = NULL; + rsp->orphan_nxttail = &rsp->orphan_nxtlist; + } +} + +/* + * Trace the fact that this CPU is going offline. + */ +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) +{ + RCU_TRACE(unsigned long mask); + RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); + RCU_TRACE(struct rcu_node *rnp = rdp->mynode); + + RCU_TRACE(mask = rdp->grpmask); + trace_rcu_grace_period(rsp->name, + rnp->gpnum + 1 - !!(rnp->qsmask & mask), + TPS("cpuofl")); +} + +/* + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup, + * including orphaning the outgoing CPU's RCU callbacks, and also + * adopting them. There can only be one CPU hotplug operation at a time, + * so no other CPU can be attempting to update rcu_cpu_kthread_task. + */ +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + int need_report = 0; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + /* Adjust any no-longer-needed kthreads. */ + rcu_boost_kthread_setaffinity(rnp, -1); + + /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ + + /* Exclude any attempts to start a new grace period. */ + mutex_lock(&rsp->onoff_mutex); + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); + + /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ + rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); + rcu_adopt_orphan_cbs(rsp); + + /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ + mask = rdp->grpmask; /* rnp->grplo is constant. */ + do { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->qsmaskinit &= ~mask; + if (rnp->qsmaskinit != 0) { + if (rnp != rdp->mynode) + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + break; + } + if (rnp == rdp->mynode) + need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + else + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + mask = rnp->grpmask; + rnp = rnp->parent; + } while (rnp != NULL); + + /* + * We still hold the leaf rcu_node structure lock here, and + * irqs are still disabled. The reason for this subterfuge is + * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock + * held leads to deadlock. + */ + raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ + rnp = rdp->mynode; + if (need_report & RCU_OFL_TASKS_NORM_GP) + rcu_report_unblock_qs_rnp(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (need_report & RCU_OFL_TASKS_EXP_GP) + rcu_report_exp_rnp(rsp, rnp, true); + WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", + cpu, rdp->qlen, rdp->nxtlist); + init_callback_list(rdp); + /* Disallow further callbacks on this CPU. */ + rdp->nxttail[RCU_NEXT_TAIL] = NULL; + mutex_unlock(&rsp->onoff_mutex); +} + +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) +{ +} + +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) +{ +} + +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Invoke any RCU callbacks that have made it to the end of their grace + * period. Thottle as specified by rdp->blimit. + */ +static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_head *next, *list, **tail; + long bl, count, count_lazy; + int i; + + /* If no callbacks are ready, just return. */ + if (!cpu_has_callbacks_ready_to_invoke(rdp)) { + trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); + trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), + need_resched(), is_idle_task(current), + rcu_is_callbacks_kthread()); + return; + } + + /* + * Extract the list of ready callbacks, disabling to prevent + * races with call_rcu() from interrupt handlers. + */ + local_irq_save(flags); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); + bl = rdp->blimit; + trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); + list = rdp->nxtlist; + rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = NULL; + tail = rdp->nxttail[RCU_DONE_TAIL]; + for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) + if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[i] = &rdp->nxtlist; + local_irq_restore(flags); + + /* Invoke callbacks. */ + count = count_lazy = 0; + while (list) { + next = list->next; + prefetch(next); + debug_rcu_head_unqueue(list); + if (__rcu_reclaim(rsp->name, list)) + count_lazy++; + list = next; + /* Stop only if limit reached and CPU has something to do. */ + if (++count >= bl && + (need_resched() || + (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) + break; + } + + local_irq_save(flags); + trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), + is_idle_task(current), + rcu_is_callbacks_kthread()); + + /* Update count, and requeue any remaining callbacks. */ + if (list != NULL) { + *tail = rdp->nxtlist; + rdp->nxtlist = list; + for (i = 0; i < RCU_NEXT_SIZE; i++) + if (&rdp->nxtlist == rdp->nxttail[i]) + rdp->nxttail[i] = tail; + else + break; + } + smp_mb(); /* List handling before counting for rcu_barrier(). */ + rdp->qlen_lazy -= count_lazy; + ACCESS_ONCE(rdp->qlen) -= count; + rdp->n_cbs_invoked += count; + + /* Reinstate batch limit if we have worked down the excess. */ + if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; + + /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ + if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = rdp->qlen; + WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); + + local_irq_restore(flags); + + /* Re-invoke RCU core processing if there are callbacks remaining. */ + if (cpu_has_callbacks_ready_to_invoke(rdp)) + invoke_rcu_core(); +} + +/* + * Check to see if this CPU is in a non-context-switch quiescent state + * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). + * Also schedule RCU core processing. + * + * This function must be called from hardirq context. It is normally + * invoked from the scheduling-clock interrupt. If rcu_pending returns + * false, there is no point in invoking rcu_check_callbacks(). + */ +void rcu_check_callbacks(int cpu, int user) +{ + trace_rcu_utilization(TPS("Start scheduler-tick")); + increment_cpu_stall_ticks(); + if (user || rcu_is_cpu_rrupt_from_idle()) { + + /* + * Get here if this CPU took its interrupt from user + * mode or from the idle loop, and if this is not a + * nested interrupt. In this case, the CPU is in + * a quiescent state, so note it. + * + * No memory barrier is required here because both + * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local + * variables that other CPUs neither access nor modify, + * at least not while the corresponding CPU is online. + */ + + rcu_sched_qs(cpu); + rcu_bh_qs(cpu); + + } else if (!in_softirq()) { + + /* + * Get here if this CPU did not take its interrupt from + * softirq, in other words, if it is not interrupting + * a rcu_bh read-side critical section. This is an _bh + * critical section, so note it. + */ + + rcu_bh_qs(cpu); + } + rcu_preempt_check_callbacks(cpu); + if (rcu_pending(cpu)) + invoke_rcu_core(); + trace_rcu_utilization(TPS("End scheduler-tick")); +} + +/* + * Scan the leaf rcu_node structures, processing dyntick state for any that + * have not yet encountered a quiescent state, using the function specified. + * Also initiate boosting for any threads blocked on the root rcu_node. + * + * The caller must have suppressed start of new grace periods. + */ +static void force_qs_rnp(struct rcu_state *rsp, + int (*f)(struct rcu_data *rsp, bool *isidle, + unsigned long *maxj), + bool *isidle, unsigned long *maxj) +{ + unsigned long bit; + int cpu; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rsp, rnp) { + cond_resched(); + mask = 0; + raw_spin_lock_irqsave(&rnp->lock, flags); + if (!rcu_gp_in_progress(rsp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + if (rnp->qsmask == 0) { + rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + continue; + } + cpu = rnp->grplo; + bit = 1; + for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + if ((rnp->qsmask & bit) != 0) { + if ((rnp->qsmaskinit & bit) != 0) + *isidle = 0; + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + mask |= bit; + } + } + if (mask != 0) { + + /* rcu_report_qs_rnp() releases rnp->lock. */ + rcu_report_qs_rnp(mask, rsp, rnp, flags); + continue; + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } + rnp = rcu_get_root(rsp); + if (rnp->qsmask == 0) { + raw_spin_lock_irqsave(&rnp->lock, flags); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ + } +} + +/* + * Force quiescent states on reluctant CPUs, and also detect which + * CPUs are in dyntick-idle mode. + */ +static void force_quiescent_state(struct rcu_state *rsp) +{ + unsigned long flags; + bool ret; + struct rcu_node *rnp; + struct rcu_node *rnp_old = NULL; + + /* Funnel through hierarchy to reduce memory contention. */ + rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; + for (; rnp != NULL; rnp = rnp->parent) { + ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || + !raw_spin_trylock(&rnp->fqslock); + if (rnp_old != NULL) + raw_spin_unlock(&rnp_old->fqslock); + if (ret) { + rsp->n_force_qs_lh++; + return; + } + rnp_old = rnp; + } + /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ + + /* Reached the root of the rcu_node tree, acquire lock. */ + raw_spin_lock_irqsave(&rnp_old->lock, flags); + raw_spin_unlock(&rnp_old->fqslock); + if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { + rsp->n_force_qs_lh++; + raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + return; /* Someone beat us to it. */ + } + rsp->gp_flags |= RCU_GP_FLAG_FQS; + raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ +} + +/* + * This does the RCU core processing work for the specified rcu_state + * and rcu_data structures. This may be called only from the CPU to + * whom the rdp belongs. + */ +static void +__rcu_process_callbacks(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + + WARN_ON_ONCE(rdp->beenonline == 0); + + /* Update RCU state based on any recent quiescent states. */ + rcu_check_quiescent_state(rsp, rdp); + + /* Does this CPU require a not-yet-started grace period? */ + local_irq_save(flags); + if (cpu_needs_another_gp(rsp, rdp)) { + raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ + rcu_start_gp(rsp); + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + } else { + local_irq_restore(flags); + } + + /* If there are callbacks ready, invoke them. */ + if (cpu_has_callbacks_ready_to_invoke(rdp)) + invoke_rcu_callbacks(rsp, rdp); +} + +/* + * Do RCU core processing for the current CPU. + */ +static void rcu_process_callbacks(struct softirq_action *unused) +{ + struct rcu_state *rsp; + + if (cpu_is_offline(smp_processor_id())) + return; + trace_rcu_utilization(TPS("Start RCU core")); + for_each_rcu_flavor(rsp) + __rcu_process_callbacks(rsp); + trace_rcu_utilization(TPS("End RCU core")); +} + +/* + * Schedule RCU callback invocation. If the specified type of RCU + * does not support RCU priority boosting, just do a direct call, + * otherwise wake up the per-CPU kernel kthread. Note that because we + * are running on the current CPU with interrupts disabled, the + * rcu_cpu_kthread_task cannot disappear out from under us. + */ +static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +{ + if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) + return; + if (likely(!rsp->boost)) { + rcu_do_batch(rsp, rdp); + return; + } + invoke_rcu_callbacks_kthread(); +} + +static void invoke_rcu_core(void) +{ + if (cpu_online(smp_processor_id())) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Handle any core-RCU processing required by a call_rcu() invocation. + */ +static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, + struct rcu_head *head, unsigned long flags) +{ + /* + * If called from an extended quiescent state, invoke the RCU + * core in order to force a re-evaluation of RCU's idleness. + */ + if (!rcu_is_watching() && cpu_online(smp_processor_id())) + invoke_rcu_core(); + + /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ + if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) + return; + + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + + /* Are we ignoring a completed grace period? */ + note_gp_changes(rsp, rdp); + + /* Start a new grace period if one not already started. */ + if (!rcu_gp_in_progress(rsp)) { + struct rcu_node *rnp_root = rcu_get_root(rsp); + + raw_spin_lock(&rnp_root->lock); + rcu_start_gp(rsp); + raw_spin_unlock(&rnp_root->lock); + } else { + /* Give the grace period a kick. */ + rdp->blimit = LONG_MAX; + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; + } + } +} + +/* + * RCU callback function to leak a callback. + */ +static void rcu_leak_callback(struct rcu_head *rhp) +{ +} + +/* + * Helper function for call_rcu() and friends. The cpu argument will + * normally be -1, indicating "currently running CPU". It may specify + * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() + * is expected to specify a CPU. + */ +static void +__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), + struct rcu_state *rsp, int cpu, bool lazy) +{ + unsigned long flags; + struct rcu_data *rdp; + + WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ + if (debug_rcu_head_queue(head)) { + /* Probable double call_rcu(), so leak the callback. */ + ACCESS_ONCE(head->func) = rcu_leak_callback; + WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); + return; + } + head->func = func; + head->next = NULL; + + /* + * Opportunistically note grace-period endings and beginnings. + * Note that we might see a beginning right after we see an + * end, but never vice versa, since this CPU has to pass through + * a quiescent state betweentimes. + */ + local_irq_save(flags); + rdp = this_cpu_ptr(rsp->rda); + + /* Add the callback to our list. */ + if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { + int offline; + + if (cpu != -1) + rdp = per_cpu_ptr(rsp->rda, cpu); + offline = !__call_rcu_nocb(rdp, head, lazy); + WARN_ON_ONCE(offline); + /* _call_rcu() is illegal on offline CPU; leak the callback. */ + local_irq_restore(flags); + return; + } + ACCESS_ONCE(rdp->qlen)++; + if (lazy) + rdp->qlen_lazy++; + else + rcu_idle_count_callbacks_posted(); + smp_mb(); /* Count before adding callback for rcu_barrier(). */ + *rdp->nxttail[RCU_NEXT_TAIL] = head; + rdp->nxttail[RCU_NEXT_TAIL] = &head->next; + + if (__is_kfree_rcu_offset((unsigned long)func)) + trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, + rdp->qlen_lazy, rdp->qlen); + else + trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); + + /* Go handle any RCU core processing required. */ + __call_rcu_core(rsp, rdp, head, flags); + local_irq_restore(flags); +} + +/* + * Queue an RCU-sched callback for invocation after a grace period. + */ +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_sched_state, -1, 0); +} +EXPORT_SYMBOL_GPL(call_rcu_sched); + +/* + * Queue an RCU callback for invocation after a quicker grace period. + */ +void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_bh_state, -1, 0); +} +EXPORT_SYMBOL_GPL(call_rcu_bh); + +/* + * Because a context switch is a grace period for RCU-sched and RCU-bh, + * any blocking grace-period wait automatically implies a grace period + * if there is only one CPU online at any point time during execution + * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds + * some overhead: RCU still operates correctly. + */ +static inline int rcu_blocking_is_gp(void) +{ + int ret; + + might_sleep(); /* Check for RCU read-side critical section. */ + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; +} + +/** + * synchronize_sched - wait until an rcu-sched grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-sched + * grace period has elapsed, in other words after all currently executing + * rcu-sched read-side critical sections have completed. These read-side + * critical sections are delimited by rcu_read_lock_sched() and + * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), + * local_irq_disable(), and so on may be used in place of + * rcu_read_lock_sched(). + * + * This means that all preempt_disable code sequences, including NMI and + * non-threaded hardware-interrupt handlers, in progress on entry will + * have completed before this primitive returns. However, this does not + * guarantee that softirq handlers will have completed, since in some + * kernels, these handlers can run in process context, and can block. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_sched() returns, + * each CPU is guaranteed to have executed a full memory barrier since the + * end of its last RCU-sched read-side critical section whose beginning + * preceded the call to synchronize_sched(). In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_sched() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_sched() and before the beginning of + * that RCU read-side critical section. Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_sched(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + * + * This primitive provides the guarantees made by the (now removed) + * synchronize_kernel() API. In contrast, synchronize_rcu() only + * guarantees that rcu_read_lock() sections will have completed. + * In "classic RCU", these two guarantees happen to be one and + * the same, but can differ in realtime RCU implementations. + */ +void synchronize_sched(void) +{ + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU-sched read-side critical section"); + if (rcu_blocking_is_gp()) + return; + if (rcu_expedited) + synchronize_sched_expedited(); + else + wait_rcu_gp(call_rcu_sched); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +/** + * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. + * + * Control will return to the caller some time after a full rcu_bh grace + * period has elapsed, in other words after all currently executing rcu_bh + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), + * and may be nested. + * + * See the description of synchronize_sched() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu_bh(void) +{ + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); + if (rcu_blocking_is_gp()) + return; + if (rcu_expedited) + synchronize_rcu_bh_expedited(); + else + wait_rcu_gp(call_rcu_bh); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. + */ + smp_mb(); /* See above comment block. */ + return 0; +} + +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. + * + * This implementation can be thought of as an application of ticket + * locking to RCU, with sync_sched_expedited_started and + * sync_sched_expedited_done taking on the roles of the halves + * of the ticket-lock word. Each task atomically increments + * sync_sched_expedited_started upon entry, snapshotting the old value, + * then attempts to stop all the CPUs. If this succeeds, then each + * CPU will have executed a context switch, resulting in an RCU-sched + * grace period. We are then done, so we use atomic_cmpxchg() to + * update sync_sched_expedited_done to match our snapshot -- but + * only if someone else has not already advanced past our snapshot. + * + * On the other hand, if try_stop_cpus() fails, we check the value + * of sync_sched_expedited_done. If it has advanced past our + * initial snapshot, then someone else must have forced a grace period + * some time after we took our snapshot. In this case, our work is + * done for us, and we can simply return. Otherwise, we try again, + * but keep our initial snapshot for purposes of checking for someone + * doing our work for us. + * + * If we fail too many times in a row, we fall back to synchronize_sched(). + */ +void synchronize_sched_expedited(void) +{ + long firstsnap, s, snap; + int trycount = 0; + struct rcu_state *rsp = &rcu_sched_state; + + /* + * If we are in danger of counter wrap, just do synchronize_sched(). + * By allowing sync_sched_expedited_started to advance no more than + * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring + * that more than 3.5 billion CPUs would be required to force a + * counter wrap on a 32-bit system. Quite a few more CPUs would of + * course be required on a 64-bit system. + */ + if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), + (ulong)atomic_long_read(&rsp->expedited_done) + + ULONG_MAX / 8)) { + synchronize_sched(); + atomic_long_inc(&rsp->expedited_wrap); + return; + } + + /* + * Take a ticket. Note that atomic_inc_return() implies a + * full memory barrier. + */ + snap = atomic_long_inc_return(&rsp->expedited_start); + firstsnap = snap; + get_online_cpus(); + WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); + + /* + * Each pass through the following loop attempts to force a + * context switch on each CPU. + */ + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + NULL) == -EAGAIN) { + put_online_cpus(); + atomic_long_inc(&rsp->expedited_tryfail); + + /* Check to see if someone else did our work for us. */ + s = atomic_long_read(&rsp->expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_workdone1); + return; + } + + /* No joy, try again later. Or just synchronize_sched(). */ + if (trycount++ < 10) { + udelay(trycount * num_online_cpus()); + } else { + wait_rcu_gp(call_rcu_sched); + atomic_long_inc(&rsp->expedited_normal); + return; + } + + /* Recheck to see if someone else did our work for us. */ + s = atomic_long_read(&rsp->expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_workdone2); + return; + } + + /* + * Refetching sync_sched_expedited_started allows later + * callers to piggyback on our grace period. We retry + * after they started, so our grace period works for them, + * and they started after our first try, so their grace + * period works for us. + */ + get_online_cpus(); + snap = atomic_long_read(&rsp->expedited_start); + smp_mb(); /* ensure read is before try_stop_cpus(). */ + } + atomic_long_inc(&rsp->expedited_stoppedcpus); + + /* + * Everyone up to our most recent fetch is covered by our grace + * period. Update the counter, but only if our work is still + * relevant -- which it won't be if someone who started later + * than we did already did their update. + */ + do { + atomic_long_inc(&rsp->expedited_done_tries); + s = atomic_long_read(&rsp->expedited_done); + if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { + /* ensure test happens before caller kfree */ + smp_mb__before_atomic_inc(); /* ^^^ */ + atomic_long_inc(&rsp->expedited_done_lost); + break; + } + } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); + atomic_long_inc(&rsp->expedited_done_exit); + + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, for the specified type of RCU, returning 1 if so. + * The checks are in order of increasing expense: checks that can be + * carried out against CPU-local state are performed first. However, + * we must check for CPU stalls first, else we might not get a chance. + */ +static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) +{ + struct rcu_node *rnp = rdp->mynode; + + rdp->n_rcu_pending++; + + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rsp, rdp); + + /* Is the RCU core waiting for a quiescent state from this CPU? */ + if (rcu_scheduler_fully_active && + rdp->qs_pending && !rdp->passed_quiesce) { + rdp->n_rp_qs_pending++; + } else if (rdp->qs_pending && rdp->passed_quiesce) { + rdp->n_rp_report_qs++; + return 1; + } + + /* Does this CPU have callbacks ready to invoke? */ + if (cpu_has_callbacks_ready_to_invoke(rdp)) { + rdp->n_rp_cb_ready++; + return 1; + } + + /* Has RCU gone idle with this CPU needing another grace period? */ + if (cpu_needs_another_gp(rsp, rdp)) { + rdp->n_rp_cpu_needs_gp++; + return 1; + } + + /* Has another RCU grace period completed? */ + if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ + rdp->n_rp_gp_completed++; + return 1; + } + + /* Has a new RCU grace period started? */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ + rdp->n_rp_gp_started++; + return 1; + } + + /* nothing to do */ + rdp->n_rp_need_nothing++; + return 0; +} + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, returning 1 if so. This function is part of the + * RCU implementation; it is -not- an exported member of the RCU API. + */ +static int rcu_pending(int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) + return 1; + return 0; +} + +/* + * Return true if the specified CPU has any callback. If all_lazy is + * non-NULL, store an indication of whether all callbacks are lazy. + * (If there are no callbacks, all of them are deemed to be lazy.) + */ +static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) +{ + bool al = true; + bool hc = false; + struct rcu_data *rdp; + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (!rdp->nxtlist) + continue; + hc = true; + if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { + al = false; + break; + } + } + if (all_lazy) + *all_lazy = al; + return hc; +} + +/* + * Helper function for _rcu_barrier() tracing. If tracing is disabled, + * the compiler is expected to optimize this away. + */ +static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, + int cpu, unsigned long done) +{ + trace_rcu_barrier(rsp->name, s, cpu, + atomic_read(&rsp->barrier_cpu_count), done); +} + +/* + * RCU callback function for _rcu_barrier(). If we are last, wake + * up the task executing _rcu_barrier(). + */ +static void rcu_barrier_callback(struct rcu_head *rhp) +{ + struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); + struct rcu_state *rsp = rdp->rsp; + + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); + complete(&rsp->barrier_completion); + } else { + _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + } +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_func(void *type) +{ + struct rcu_state *rsp = type; + struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); + + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + rsp->call(&rdp->barrier_head, rcu_barrier_callback); +} + +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. + */ +static void _rcu_barrier(struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); + unsigned long snap_done; + + _rcu_barrier_trace(rsp, "Begin", -1, snap); + + /* Take mutex to serialize concurrent rcu_barrier() requests. */ + mutex_lock(&rsp->barrier_mutex); + + /* + * Ensure that all prior references, including to ->n_barrier_done, + * are ordered before the _rcu_barrier() machinery. + */ + smp_mb(); /* See above block comment. */ + + /* + * Recheck ->n_barrier_done to see if others did our work for us. + * This means checking ->n_barrier_done for an even-to-odd-to-even + * transition. The "if" expression below therefore rounds the old + * value up to the next even number and adds two before comparing. + */ + snap_done = rsp->n_barrier_done; + _rcu_barrier_trace(rsp, "Check", -1, snap_done); + + /* + * If the value in snap is odd, we needed to wait for the current + * rcu_barrier() to complete, then wait for the next one, in other + * words, we need the value of snap_done to be three larger than + * the value of snap. On the other hand, if the value in snap is + * even, we only had to wait for the next rcu_barrier() to complete, + * in other words, we need the value of snap_done to be only two + * greater than the value of snap. The "(snap + 3) & ~0x1" computes + * this for us (thank you, Linus!). + */ + if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); + smp_mb(); /* caller's subsequent code after above check. */ + mutex_unlock(&rsp->barrier_mutex); + return; + } + + /* + * Increment ->n_barrier_done to avoid duplicate work. Use + * ACCESS_ONCE() to prevent the compiler from speculating + * the increment to precede the early-exit check. + */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); + smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ + + /* + * Initialize the count to one rather than to zero in order to + * avoid a too-soon return to zero in case of a short grace period + * (or preemption of this task). Exclude CPU-hotplug operations + * to ensure that no offline CPU has callbacks queued. + */ + init_completion(&rsp->barrier_completion); + atomic_set(&rsp->barrier_cpu_count, 1); + get_online_cpus(); + + /* + * Force each CPU with callbacks to register a new callback. + * When that callback is invoked, we will know that all of the + * corresponding CPU's preceding callbacks have been invoked. + */ + for_each_possible_cpu(cpu) { + if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) + continue; + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rcu_is_nocb_cpu(cpu)) { + _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + rsp->n_barrier_done); + atomic_inc(&rsp->barrier_cpu_count); + __call_rcu(&rdp->barrier_head, rcu_barrier_callback, + rsp, cpu, 0); + } else if (ACCESS_ONCE(rdp->qlen)) { + _rcu_barrier_trace(rsp, "OnlineQ", cpu, + rsp->n_barrier_done); + smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); + } else { + _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + rsp->n_barrier_done); + } + } + put_online_cpus(); + + /* + * Now that we have an rcu_barrier_callback() callback on each + * CPU, and thus each counted, remove the initial count. + */ + if (atomic_dec_and_test(&rsp->barrier_cpu_count)) + complete(&rsp->barrier_completion); + + /* Increment ->n_barrier_done to prevent duplicate work. */ + smp_mb(); /* Keep increment after above mechanism. */ + ACCESS_ONCE(rsp->n_barrier_done)++; + WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); + smp_mb(); /* Keep increment before caller's subsequent code. */ + + /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ + wait_for_completion(&rsp->barrier_completion); + + /* Other rcu_barrier() invocations can now safely proceed. */ + mutex_unlock(&rsp->barrier_mutex); +} + +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(&rcu_bh_state); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(&rcu_sched_state); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + +/* + * Do boot-time initialization of a CPU's per-CPU RCU data. + */ +static void __init +rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Set up local state, ensuring consistent view of global state. */ + raw_spin_lock_irqsave(&rnp->lock, flags); + rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + init_callback_list(rdp); + rdp->qlen_lazy = 0; + ACCESS_ONCE(rdp->qlen) = 0; + rdp->dynticks = &per_cpu(rcu_dynticks, cpu); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); + WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); + rdp->cpu = cpu; + rdp->rsp = rsp; + rcu_boot_init_nocb_percpu_data(rdp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Initialize a CPU's per-CPU RCU data. Note that only one online or + * offline event can be happening at a given time. Note also that we + * can accept some slop in the rsp->completed access due to the fact + * that this CPU cannot possibly have any RCU callbacks in flight yet. + */ +static void +rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Exclude new grace periods. */ + mutex_lock(&rsp->onoff_mutex); + + /* Set up local state, ensuring consistent view of global state. */ + raw_spin_lock_irqsave(&rnp->lock, flags); + rdp->beenonline = 1; /* We have now been online. */ + rdp->preemptible = preemptible; + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->blimit = blimit; + init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ + rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rcu_sysidle_init_percpu_data(rdp->dynticks); + atomic_set(&rdp->dynticks->dynticks, + (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* Add CPU to rcu_node bitmasks. */ + rnp = rdp->mynode; + mask = rdp->grpmask; + do { + /* Exclude any attempts to start a new GP on small systems. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->qsmaskinit |= mask; + mask = rnp->grpmask; + if (rnp == rdp->mynode) { + /* + * If there is a grace period in progress, we will + * set up to wait for it next time we run the + * RCU core code. + */ + rdp->gpnum = rnp->completed; + rdp->completed = rnp->completed; + rdp->passed_quiesce = 0; + rdp->qs_pending = 0; + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); + } + raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ + rnp = rnp->parent; + } while (rnp != NULL && !(rnp->qsmaskinit & mask)); + local_irq_restore(flags); + + mutex_unlock(&rsp->onoff_mutex); +} + +static void rcu_prepare_cpu(int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_init_percpu_data(cpu, rsp, + strcmp(rsp->name, "rcu_preempt") == 0); +} + +/* + * Handle CPU online/offline notification events. + */ +static int rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + struct rcu_state *rsp; + + trace_rcu_utilization(TPS("Start CPU hotplug")); + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + rcu_prepare_cpu(cpu); + rcu_prepare_kthreads(cpu); + break; + case CPU_ONLINE: + case CPU_DOWN_FAILED: + rcu_boost_kthread_setaffinity(rnp, -1); + break; + case CPU_DOWN_PREPARE: + rcu_boost_kthread_setaffinity(rnp, cpu); + break; + case CPU_DYING: + case CPU_DYING_FROZEN: + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_cpu(rsp); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + for_each_rcu_flavor(rsp) + rcu_cleanup_dead_cpu(cpu, rsp); + break; + default: + break; + } + trace_rcu_utilization(TPS("End CPU hotplug")); + return NOTIFY_OK; +} + +static int rcu_pm_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ + rcu_expedited = 1; + break; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + rcu_expedited = 0; + break; + default: + break; + } + return NOTIFY_OK; +} + +/* + * Spawn the kthread that handles this RCU flavor's grace periods. + */ +static int __init rcu_spawn_gp_kthread(void) +{ + unsigned long flags; + struct rcu_node *rnp; + struct rcu_state *rsp; + struct task_struct *t; + + for_each_rcu_flavor(rsp) { + t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); + BUG_ON(IS_ERR(t)); + rnp = rcu_get_root(rsp); + raw_spin_lock_irqsave(&rnp->lock, flags); + rsp->gp_kthread = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_spawn_nocb_kthreads(rsp); + } + return 0; +} +early_initcall(rcu_spawn_gp_kthread); + +/* + * This function is invoked towards the end of the scheduler's initialization + * process. Before this is called, the idle task might contain + * RCU read-side critical sections (during which time, this idle + * task is booting the system). After this function is called, the + * idle tasks are prohibited from containing RCU read-side critical + * sections. This function also enables RCU lockdep checking. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + +/* + * Compute the per-level fanout, either using the exact fanout specified + * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. + */ +#ifdef CONFIG_RCU_FANOUT_EXACT +static void __init rcu_init_levelspread(struct rcu_state *rsp) +{ + int i; + + for (i = rcu_num_lvls - 1; i > 0; i--) + rsp->levelspread[i] = CONFIG_RCU_FANOUT; + rsp->levelspread[0] = rcu_fanout_leaf; +} +#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ +static void __init rcu_init_levelspread(struct rcu_state *rsp) +{ + int ccur; + int cprv; + int i; + + cprv = nr_cpu_ids; + for (i = rcu_num_lvls - 1; i >= 0; i--) { + ccur = rsp->levelcnt[i]; + rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } +} +#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ + +/* + * Helper function for rcu_init() that initializes one rcu_state structure. + */ +static void __init rcu_init_one(struct rcu_state *rsp, + struct rcu_data __percpu *rda) +{ + static char *buf[] = { "rcu_node_0", + "rcu_node_1", + "rcu_node_2", + "rcu_node_3" }; /* Match MAX_RCU_LVLS */ + static char *fqs[] = { "rcu_node_fqs_0", + "rcu_node_fqs_1", + "rcu_node_fqs_2", + "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + int cpustride = 1; + int i; + int j; + struct rcu_node *rnp; + + BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + + /* Silence gcc 4.8 warning about array index out of range. */ + if (rcu_num_lvls > RCU_NUM_LVLS) + panic("rcu_init_one: rcu_num_lvls overflow"); + + /* Initialize the level-tracking arrays. */ + + for (i = 0; i < rcu_num_lvls; i++) + rsp->levelcnt[i] = num_rcu_lvl[i]; + for (i = 1; i < rcu_num_lvls; i++) + rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; + rcu_init_levelspread(rsp); + + /* Initialize the elements themselves, starting from the leaves. */ + + for (i = rcu_num_lvls - 1; i >= 0; i--) { + cpustride *= rsp->levelspread[i]; + rnp = rsp->level[i]; + for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { + raw_spin_lock_init(&rnp->lock); + lockdep_set_class_and_name(&rnp->lock, + &rcu_node_class[i], buf[i]); + raw_spin_lock_init(&rnp->fqslock); + lockdep_set_class_and_name(&rnp->fqslock, + &rcu_fqs_class[i], fqs[i]); + rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; + rnp->qsmask = 0; + rnp->qsmaskinit = 0; + rnp->grplo = j * cpustride; + rnp->grphi = (j + 1) * cpustride - 1; + if (rnp->grphi >= NR_CPUS) + rnp->grphi = NR_CPUS - 1; + if (i == 0) { + rnp->grpnum = 0; + rnp->grpmask = 0; + rnp->parent = NULL; + } else { + rnp->grpnum = j % rsp->levelspread[i - 1]; + rnp->grpmask = 1UL << rnp->grpnum; + rnp->parent = rsp->level[i - 1] + + j / rsp->levelspread[i - 1]; + } + rnp->level = i; + INIT_LIST_HEAD(&rnp->blkd_tasks); + rcu_init_one_nocb(rnp); + } + } + + rsp->rda = rda; + init_waitqueue_head(&rsp->gp_wq); + init_irq_work(&rsp->wakeup_work, rsp_wakeup); + rnp = rsp->level[rcu_num_lvls - 1]; + for_each_possible_cpu(i) { + while (i > rnp->grphi) + rnp++; + per_cpu_ptr(rsp->rda, i)->mynode = rnp; + rcu_boot_init_percpu_data(i, rsp); + } + list_add(&rsp->flavors, &rcu_struct_flavors); +} + +/* + * Compute the rcu_node tree geometry from kernel parameters. This cannot + * replace the definitions in tree.h because those are needed to size + * the ->node array in the rcu_state structure. + */ +static void __init rcu_init_geometry(void) +{ + ulong d; + int i; + int j; + int n = nr_cpu_ids; + int rcu_capacity[MAX_RCU_LVLS + 1]; + + /* + * Initialize any unspecified boot parameters. + * The default values of jiffies_till_first_fqs and + * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS + * value, which is a function of HZ, then adding one for each + * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. + */ + d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; + if (jiffies_till_first_fqs == ULONG_MAX) + jiffies_till_first_fqs = d; + if (jiffies_till_next_fqs == ULONG_MAX) + jiffies_till_next_fqs = d; + + /* If the compile-time values are accurate, just leave. */ + if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && + nr_cpu_ids == NR_CPUS) + return; + + /* + * Compute number of nodes that can be handled an rcu_node tree + * with the given number of levels. Setting rcu_capacity[0] makes + * some of the arithmetic easier. + */ + rcu_capacity[0] = 1; + rcu_capacity[1] = rcu_fanout_leaf; + for (i = 2; i <= MAX_RCU_LVLS; i++) + rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; + + /* + * The boot-time rcu_fanout_leaf parameter is only permitted + * to increase the leaf-level fanout, not decrease it. Of course, + * the leaf-level fanout cannot exceed the number of bits in + * the rcu_node masks. Finally, the tree must be able to accommodate + * the configured number of CPUs. Complain and fall back to the + * compile-time values if these limits are exceeded. + */ + if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || + rcu_fanout_leaf > sizeof(unsigned long) * 8 || + n > rcu_capacity[MAX_RCU_LVLS]) { + WARN_ON(1); + return; + } + + /* Calculate the number of rcu_nodes at each level of the tree. */ + for (i = 1; i <= MAX_RCU_LVLS; i++) + if (n <= rcu_capacity[i]) { + for (j = 0; j <= i; j++) + num_rcu_lvl[j] = + DIV_ROUND_UP(n, rcu_capacity[i - j]); + rcu_num_lvls = i; + for (j = i + 1; j <= MAX_RCU_LVLS; j++) + num_rcu_lvl[j] = 0; + break; + } + + /* Calculate the total number of rcu_node structures. */ + rcu_num_nodes = 0; + for (i = 0; i <= MAX_RCU_LVLS; i++) + rcu_num_nodes += num_rcu_lvl[i]; + rcu_num_nodes -= n; +} + +void __init rcu_init(void) +{ + int cpu; + + rcu_bootup_announce(); + rcu_init_geometry(); + rcu_init_one(&rcu_bh_state, &rcu_bh_data); + rcu_init_one(&rcu_sched_state, &rcu_sched_data); + __rcu_init_preempt(); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + + /* + * We don't need protection against CPU-hotplug here because + * this is called early in boot, before either interrupts + * or the scheduler are operational. + */ + cpu_notifier(rcu_cpu_notify, 0); + pm_notifier(rcu_pm_notify, 0); + for_each_online_cpu(cpu) + rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); +} + +#include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h new file mode 100644 index 000000000000..52be957c9fe2 --- /dev/null +++ b/kernel/rcu/tree.h @@ -0,0 +1,585 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Ingo Molnar + * Paul E. McKenney + */ + +#include +#include +#include +#include +#include +#include + +/* + * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and + * CONFIG_RCU_FANOUT_LEAF. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this did work well going from three levels to four. + * Of course, your mileage may vary. + */ +#define MAX_RCU_LVLS 4 +#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT_1 +# define RCU_NUM_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (NR_CPUS) +# define NUM_RCU_LVL_2 0 +# define NUM_RCU_LVL_3 0 +# define NUM_RCU_LVL_4 0 +#elif NR_CPUS <= RCU_FANOUT_2 +# define RCU_NUM_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_2 (NR_CPUS) +# define NUM_RCU_LVL_3 0 +# define NUM_RCU_LVL_4 0 +#elif NR_CPUS <= RCU_FANOUT_3 +# define RCU_NUM_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_3 (NR_CPUS) +# define NUM_RCU_LVL_4 0 +#elif NR_CPUS <= RCU_FANOUT_4 +# define RCU_NUM_LVLS 4 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_4 (NR_CPUS) +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ + +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) +#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) + +extern int rcu_num_lvls; +extern int rcu_num_nodes; + +/* + * Dynticks per-CPU state. + */ +struct rcu_dynticks { + long long dynticks_nesting; /* Track irq/process nesting level. */ + /* Process level is worth LLONG_MAX/2. */ + int dynticks_nmi_nesting; /* Track NMI nesting level. */ + atomic_t dynticks; /* Even value for idle, else odd. */ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + long long dynticks_idle_nesting; + /* irq/process nesting level from idle. */ + atomic_t dynticks_idle; /* Even value for idle, else odd. */ + /* "Idle" excludes userspace execution. */ + unsigned long dynticks_idle_jiffies; + /* End of last non-NMI non-idle period. */ +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +#ifdef CONFIG_RCU_FAST_NO_HZ + bool all_lazy; /* Are all CPU's CBs lazy? */ + unsigned long nonlazy_posted; + /* # times non-lazy CBs posted to CPU. */ + unsigned long nonlazy_posted_snap; + /* idle-period nonlazy_posted snapshot. */ + unsigned long last_accelerate; + /* Last jiffy CBs were accelerated. */ + unsigned long last_advance_all; + /* Last jiffy CBs were all advanced. */ + int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ +}; + +/* RCU's kthread states for tracing. */ +#define RCU_KTHREAD_STOPPED 0 +#define RCU_KTHREAD_RUNNING 1 +#define RCU_KTHREAD_WAITING 2 +#define RCU_KTHREAD_OFFCPU 3 +#define RCU_KTHREAD_YIELDING 4 +#define RCU_KTHREAD_MAX 4 + +/* + * Definition for node within the RCU grace-period-detection hierarchy. + */ +struct rcu_node { + raw_spinlock_t lock; /* Root rcu_node's lock protects some */ + /* rcu_state fields as well as following. */ + unsigned long gpnum; /* Current grace period for this node. */ + /* This will either be equal to or one */ + /* behind the root rcu_node's gpnum. */ + unsigned long completed; /* Last GP completed for this node. */ + /* This will either be equal to or one */ + /* behind the root rcu_node's gpnum. */ + unsigned long qsmask; /* CPUs or groups that need to switch in */ + /* order for current grace period to proceed.*/ + /* In leaf rcu_node, each bit corresponds to */ + /* an rcu_data structure, otherwise, each */ + /* bit corresponds to a child rcu_node */ + /* structure. */ + unsigned long expmask; /* Groups that have ->blkd_tasks */ + /* elements that need to drain to allow the */ + /* current expedited grace period to */ + /* complete (only for TREE_PREEMPT_RCU). */ + unsigned long qsmaskinit; + /* Per-GP initial value for qsmask & expmask. */ + unsigned long grpmask; /* Mask to apply to parent qsmask. */ + /* Only one bit will be set in this mask. */ + int grplo; /* lowest-numbered CPU or group here. */ + int grphi; /* highest-numbered CPU or group here. */ + u8 grpnum; /* CPU/group number for next level up. */ + u8 level; /* root is at level 0. */ + struct rcu_node *parent; + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. */ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is no such task. */ + struct list_head *exp_tasks; + /* Pointer to the first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there can cannot be any such task. */ +#ifdef CONFIG_RCU_BOOST + struct list_head *boost_tasks; + /* Pointer to first task that needs to be */ + /* priority boosted, or NULL if no priority */ + /* boosting is needed for this rcu_node */ + /* structure. If there are no tasks */ + /* queued on this rcu_node structure that */ + /* are blocking the current grace period, */ + /* there can be no such task. */ + unsigned long boost_time; + /* When to start boosting (jiffies). */ + struct task_struct *boost_kthread_task; + /* kthread that takes care of priority */ + /* boosting for this rcu_node structure. */ + unsigned int boost_kthread_status; + /* State of boost_kthread_task for tracing. */ + unsigned long n_tasks_boosted; + /* Total number of tasks boosted. */ + unsigned long n_exp_boosts; + /* Number of tasks boosted for expedited GP. */ + unsigned long n_normal_boosts; + /* Number of tasks boosted for normal GP. */ + unsigned long n_balk_blkd_tasks; + /* Refused to boost: no blocked tasks. */ + unsigned long n_balk_exp_gp_tasks; + /* Refused to boost: nothing blocking GP. */ + unsigned long n_balk_boost_tasks; + /* Refused to boost: already boosting. */ + unsigned long n_balk_notblocked; + /* Refused to boost: RCU RS CS still running. */ + unsigned long n_balk_notyet; + /* Refused to boost: not yet time. */ + unsigned long n_balk_nos; + /* Refused to boost: not sure why, though. */ + /* This can happen due to race conditions. */ +#endif /* #ifdef CONFIG_RCU_BOOST */ +#ifdef CONFIG_RCU_NOCB_CPU + wait_queue_head_t nocb_gp_wq[2]; + /* Place for rcu_nocb_kthread() to wait GP. */ +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + int need_future_gp[2]; + /* Counts of upcoming no-CB GP requests. */ + raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; +} ____cacheline_internodealigned_in_smp; + +/* + * Do a full breadth-first scan of the rcu_node structures for the + * specified rcu_state structure. + */ +#define rcu_for_each_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) + +/* + * Do a breadth-first scan of the non-leaf rcu_node structures for the + * specified rcu_state structure. Note that if there is a singleton + * rcu_node tree with but one rcu_node structure, this loop is a no-op. + */ +#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) + +/* + * Scan the leaves of the rcu_node hierarchy for the specified rcu_state + * structure. Note that if there is a singleton rcu_node tree with but + * one rcu_node structure, this loop -will- visit the rcu_node structure. + * It is still a leaf node, even if it is also the root node. + */ +#define rcu_for_each_leaf_node(rsp, rnp) \ + for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) + +/* Index values for nxttail array in struct rcu_data. */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_NEXT_SIZE 4 + +/* Per-CPU data for read-copy update. */ +struct rcu_data { + /* 1) quiescent-state and grace-period handling : */ + unsigned long completed; /* Track rsp->completed gp number */ + /* in order to detect GP end. */ + unsigned long gpnum; /* Highest gp number that this CPU */ + /* is aware of having started. */ + bool passed_quiesce; /* User-mode/idle loop etc. */ + bool qs_pending; /* Core waits for quiesc state. */ + bool beenonline; /* CPU online at least once. */ + bool preemptible; /* Preemptible RCU? */ + struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ + unsigned long grpmask; /* Mask to apply to leaf qsmask. */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + unsigned long ticks_this_gp; /* The number of scheduling-clock */ + /* ticks this CPU has handled */ + /* during and after the last grace */ + /* period it is aware of. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + + /* 2) batch handling */ + /* + * If nxtlist is not NULL, it is partitioned as follows. + * Any of the partitions might be empty, in which case the + * pointer to that partition will be equal to the pointer for + * the following partition. When the list is empty, all of + * the nxttail elements point to the ->nxtlist pointer itself, + * which in that case is NULL. + * + * [nxtlist, *nxttail[RCU_DONE_TAIL]): + * Entries that batch # <= ->completed + * The grace period for these entries has completed, and + * the other grace-period-completed entries may be moved + * here temporarily in rcu_process_callbacks(). + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * Note that the value of *nxttail[RCU_NEXT_TAIL] will + * always be NULL, as this is the end of the list. + */ + struct rcu_head *nxtlist; + struct rcu_head **nxttail[RCU_NEXT_SIZE]; + unsigned long nxtcompleted[RCU_NEXT_SIZE]; + /* grace periods for sublists. */ + long qlen_lazy; /* # of lazy queued callbacks */ + long qlen; /* # of queued callbacks, incl lazy */ + long qlen_last_fqs_check; + /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ + unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ + unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ + unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ + unsigned long n_force_qs_snap; + /* did other CPU force QS recently? */ + long blimit; /* Upper limit on a processed batch */ + + /* 3) dynticks interface. */ + struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ + int dynticks_snap; /* Per-GP tracking for dynticks. */ + + /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ + unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ + unsigned long offline_fqs; /* Kicked due to being offline. */ + + /* 5) __rcu_pending() statistics. */ + unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ + unsigned long n_rp_qs_pending; + unsigned long n_rp_report_qs; + unsigned long n_rp_cb_ready; + unsigned long n_rp_cpu_needs_gp; + unsigned long n_rp_gp_completed; + unsigned long n_rp_gp_started; + unsigned long n_rp_need_nothing; + + /* 6) _rcu_barrier() and OOM callbacks. */ + struct rcu_head barrier_head; +#ifdef CONFIG_RCU_FAST_NO_HZ + struct rcu_head oom_head; +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + + /* 7) Callback offloading. */ +#ifdef CONFIG_RCU_NOCB_CPU + struct rcu_head *nocb_head; /* CBs waiting for kthread. */ + struct rcu_head **nocb_tail; + atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ + atomic_long_t nocb_q_count_lazy; /* (approximate). */ + int nocb_p_count; /* # CBs being invoked by kthread */ + int nocb_p_count_lazy; /* (approximate). */ + wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct task_struct *nocb_kthread; +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + + /* 8) RCU CPU stall data. */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + unsigned int softirq_snap; /* Snapshot of softirq activity. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + + int cpu; + struct rcu_state *rsp; +}; + +/* Values for fqs_state field in struct rcu_state. */ +#define RCU_GP_IDLE 0 /* No grace period in progress. */ +#define RCU_GP_INIT 1 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ +#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ +#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK + +#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) + /* For jiffies_till_first_fqs and */ + /* and jiffies_till_next_fqs. */ + +#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */ + /* delay between bouts of */ + /* quiescent-state forcing. */ + +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */ + /* at least one scheduling clock */ + /* irq before ratting on them. */ + +#define rcu_wait(cond) \ +do { \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (cond) \ + break; \ + schedule(); \ + } \ + __set_current_state(TASK_RUNNING); \ +} while (0) + +/* + * RCU global state, including node hierarchy. This hierarchy is + * represented in "heap" form in a dense array. The root (first level) + * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second + * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), + * and the third level in ->node[m+1] and following (->node[m+1] referenced + * by ->level[2]). The number of levels is determined by the number of + * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy" + * consisting of a single rcu_node. + */ +struct rcu_state { + struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ + struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ + u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ + u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ + struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ + void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ + void (*func)(struct rcu_head *head)); + + /* The following fields are guarded by the root rcu_node's lock. */ + + u8 fqs_state ____cacheline_internodealigned_in_smp; + /* Force QS state. */ + u8 boost; /* Subject to priority boost. */ + unsigned long gpnum; /* Current gp number. */ + unsigned long completed; /* # of last completed gp. */ + struct task_struct *gp_kthread; /* Task for grace periods. */ + wait_queue_head_t gp_wq; /* Where GP task waits. */ + int gp_flags; /* Commands for GP task. */ + + /* End of fields guarded by root rcu_node's lock. */ + + raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; + /* Protect following fields. */ + struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ + /* need a grace period. */ + struct rcu_head **orphan_nxttail; /* Tail of above. */ + struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ + /* are ready to invoke. */ + struct rcu_head **orphan_donetail; /* Tail of above. */ + long qlen_lazy; /* Number of lazy callbacks. */ + long qlen; /* Total number of callbacks. */ + /* End of fields guarded by orphan_lock. */ + + struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ + + struct mutex barrier_mutex; /* Guards barrier fields. */ + atomic_t barrier_cpu_count; /* # CPUs waiting on. */ + struct completion barrier_completion; /* Wake at barrier end. */ + unsigned long n_barrier_done; /* ++ at start and end of */ + /* _rcu_barrier(). */ + /* End of fields guarded by barrier_mutex. */ + + atomic_long_t expedited_start; /* Starting ticket. */ + atomic_long_t expedited_done; /* Done ticket. */ + atomic_long_t expedited_wrap; /* # near-wrap incidents. */ + atomic_long_t expedited_tryfail; /* # acquisition failures. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_normal; /* # fallbacks to normal. */ + atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ + atomic_long_t expedited_done_tries; /* # tries to update _done. */ + atomic_long_t expedited_done_lost; /* # times beaten to _done. */ + atomic_long_t expedited_done_exit; /* # times exited _done loop. */ + + unsigned long jiffies_force_qs; /* Time at which to invoke */ + /* force_quiescent_state(). */ + unsigned long n_force_qs; /* Number of calls to */ + /* force_quiescent_state(). */ + unsigned long n_force_qs_lh; /* ~Number of calls leaving */ + /* due to lock unavailable. */ + unsigned long n_force_qs_ngp; /* Number of calls leaving */ + /* due to no GP active. */ + unsigned long gp_start; /* Time at which GP started, */ + /* but in jiffies. */ + unsigned long jiffies_stall; /* Time at which to check */ + /* for CPU stalls. */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ + const char *name; /* Name of structure. */ + char abbr; /* Abbreviated name. */ + struct list_head flavors; /* List of RCU flavors. */ + struct irq_work wakeup_work; /* Postponed wakeups */ +}; + +/* Values for rcu_state structure's gp_flags field. */ +#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ +#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ + +extern struct list_head rcu_struct_flavors; + +/* Sequence through rcu_state structures for each RCU flavor. */ +#define for_each_rcu_flavor(rsp) \ + list_for_each_entry((rsp), &rcu_struct_flavors, flavors) + +/* Return values for rcu_preempt_offline_tasks(). */ + +#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ + /* GP were moved to root. */ +#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ + /* GP were moved to root. */ + +/* + * RCU implementation internal declarations: + */ +extern struct rcu_state rcu_sched_state; +DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); + +extern struct rcu_state rcu_bh_state; +DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); + +#ifdef CONFIG_TREE_PREEMPT_RCU +extern struct rcu_state rcu_preempt_state; +DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +#ifdef CONFIG_RCU_BOOST +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DECLARE_PER_CPU(char, rcu_cpu_has_work); +#endif /* #ifdef CONFIG_RCU_BOOST */ + +#ifndef RCU_TREE_NONCORE + +/* Forward declarations for rcutree_plugin.h */ +static void rcu_bootup_announce(void); +long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, + unsigned long flags); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_print_detail_task_stall(struct rcu_state *rsp); +static int rcu_print_task_stall(struct rcu_node *rnp); +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_preempt_check_callbacks(int cpu); +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake); +#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ +static void __init __rcu_init_preempt(void); +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); +static void invoke_rcu_callbacks_kthread(void); +static bool rcu_is_callbacks_kthread(void); +#ifdef CONFIG_RCU_BOOST +static void rcu_preempt_do_callbacks(void); +static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp); +#endif /* #ifdef CONFIG_RCU_BOOST */ +static void rcu_prepare_kthreads(int cpu); +static void rcu_cleanup_after_idle(int cpu); +static void rcu_prepare_for_idle(int cpu); +static void rcu_idle_count_callbacks_posted(void); +static void print_cpu_stall_info_begin(void); +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); +static void print_cpu_stall_info_end(void); +static void zero_cpu_stall_ticks(struct rcu_data *rdp); +static void increment_cpu_stall_ticks(void); +static int rcu_nocb_needs_gp(struct rcu_state *rsp); +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static void rcu_init_one_nocb(struct rcu_node *rnp); +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy); +static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp); +static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); +static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); +static void rcu_kick_nohz_cpu(int cpu); +static bool init_nocb_callback_list(struct rcu_data *rdp); +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); +static void rcu_bind_gp_kthread(void); +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); + +#endif /* #ifndef RCU_TREE_NONCORE */ + +#ifdef CONFIG_RCU_TRACE +#ifdef CONFIG_RCU_NOCB_CPU +/* Sum up queue lengths for tracing. */ +static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) +{ + *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; + *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; +} +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ +static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) +{ + *ql = 0; + *qll = 0; +} +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h new file mode 100644 index 000000000000..3822ac0c4b27 --- /dev/null +++ b/kernel/rcu/tree_plugin.h @@ -0,0 +1,2831 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions that provide either classic + * or preemptible semantics. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright Red Hat, 2009 + * Copyright IBM Corporation, 2009 + * + * Author: Ingo Molnar + * Paul E. McKenney + */ + +#include +#include +#include +#include +#include "../time/tick-internal.h" + +#define RCU_KTHREAD_PRIO 1 + +#ifdef CONFIG_RCU_BOOST +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO +#else +#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO +#endif + +#ifdef CONFIG_RCU_NOCB_CPU +static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ +static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ +static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ +static char __initdata nocb_buf[NR_CPUS * 5]; +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + +/* + * Check the RCU kernel configuration parameters and print informative + * messages about anything out of the ordinary. If you like #ifdef, you + * will love this function. + */ +static void __init rcu_bootup_announce_oddness(void) +{ +#ifdef CONFIG_RCU_TRACE + pr_info("\tRCU debugfs-based tracing is enabled.\n"); +#endif +#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + CONFIG_RCU_FANOUT); +#endif +#ifdef CONFIG_RCU_FANOUT_EXACT + pr_info("\tHierarchical RCU autobalancing is disabled.\n"); +#endif +#ifdef CONFIG_RCU_FAST_NO_HZ + pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); +#endif +#ifdef CONFIG_PROVE_RCU + pr_info("\tRCU lockdep checking is enabled.\n"); +#endif +#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE + pr_info("\tRCU torture testing starts during boot.\n"); +#endif +#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) + pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); +#endif +#if defined(CONFIG_RCU_CPU_STALL_INFO) + pr_info("\tAdditional per-CPU info printed with stalls.\n"); +#endif +#if NUM_RCU_LVL_4 != 0 + pr_info("\tFour-level hierarchy is enabled.\n"); +#endif + if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) + pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); + if (nr_cpu_ids != NR_CPUS) + pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); +#ifdef CONFIG_RCU_NOCB_CPU +#ifndef CONFIG_RCU_NOCB_CPU_NONE + if (!have_rcu_nocb_mask) { + zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); + have_rcu_nocb_mask = true; + } +#ifdef CONFIG_RCU_NOCB_CPU_ZERO + pr_info("\tOffload RCU callbacks from CPU 0\n"); + cpumask_set_cpu(0, rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ +#ifdef CONFIG_RCU_NOCB_CPU_ALL + pr_info("\tOffload RCU callbacks from all CPUs\n"); + cpumask_copy(rcu_nocb_mask, cpu_possible_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ + if (have_rcu_nocb_mask) { + if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { + pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); + cpumask_and(rcu_nocb_mask, cpu_possible_mask, + rcu_nocb_mask); + } + cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); + pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); + if (rcu_nocb_poll) + pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); + } +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ +} + +#ifdef CONFIG_TREE_PREEMPT_RCU + +RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); +static struct rcu_state *rcu_state = &rcu_preempt_state; + +static int rcu_preempted_readers_exp(struct rcu_node *rnp); + +/* + * Tell them what RCU they are running. + */ +static void __init rcu_bootup_announce(void) +{ + pr_info("Preemptible hierarchical RCU implementation.\n"); + rcu_bootup_announce_oddness(); +} + +/* + * Return the number of RCU-preempt batches processed thus far + * for debug and statistics. + */ +long rcu_batches_completed_preempt(void) +{ + return rcu_preempt_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); + +/* + * Return the number of RCU batches processed thus far for debug & stats. + */ +long rcu_batches_completed(void) +{ + return rcu_batches_completed_preempt(); +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Force a quiescent state for preemptible RCU. + */ +void rcu_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_preempt_state); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + +/* + * Record a preemptible-RCU quiescent state for the specified CPU. Note + * that this just means that the task currently running on the CPU is + * not in a quiescent state. There might be any number of tasks blocked + * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. + */ +static void rcu_preempt_qs(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); + rdp->passed_quiesce = 1; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; +} + +/* + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the blkd_tasks list. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the blkd_tasks list entries + * predating the current grace period drain, in other words, until + * rnp->gp_tasks becomes NULL. + * + * Caller must disable preemption. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ + struct task_struct *t = current; + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + + if (t->rcu_read_lock_nesting > 0 && + (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + + /* Possibly blocking in an RCU read-side critical section. */ + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave(&rnp->lock, flags); + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + t->rcu_blocked_node = rnp; + + /* + * If this CPU has already checked in, then this task + * will hold up the next grace period rather than the + * current grace period. Queue the task accordingly. + * If the task is queued for the current grace period + * (i.e., this CPU has not yet passed through a quiescent + * state for the current grace period), then as long + * as that task remains queued, the current grace period + * cannot end. Note that there is some uncertainty as + * to exactly when the current grace period started. + * We take a conservative approach, which can result + * in unnecessarily waiting on tasks that started very + * slightly after the current grace period began. C'est + * la vie!!! + * + * But first, note that the current CPU must still be + * on line! + */ + WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); + WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); + if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { + list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); + rnp->gp_tasks = &t->rcu_node_entry; +#ifdef CONFIG_RCU_BOOST + if (rnp->boost_tasks != NULL) + rnp->boost_tasks = rnp->gp_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ + } else { + list_add(&t->rcu_node_entry, &rnp->blkd_tasks); + if (rnp->qsmask & rdp->grpmask) + rnp->gp_tasks = &t->rcu_node_entry; + } + trace_rcu_preempt_task(rdp->rsp->name, + t->pid, + (rnp->qsmask & rdp->grpmask) + ? rnp->gpnum + : rnp->gpnum + 1); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } else if (t->rcu_read_lock_nesting < 0 && + t->rcu_read_unlock_special) { + + /* + * Complete exit from RCU read-side critical section on + * behalf of preempted instance of __rcu_read_unlock(). + */ + rcu_read_unlock_special(t); + } + + /* + * Either we were not in an RCU read-side critical section to + * begin with, or we have now recorded that critical section + * globally. Either way, we can now note a quiescent state + * for this CPU. Again, if we were in an RCU read-side critical + * section, and if that critical section was blocking the current + * grace period, then the fact that the task has been enqueued + * means that we continue to block the current grace period. + */ + local_irq_save(flags); + rcu_preempt_qs(cpu); + local_irq_restore(flags); +} + +/* + * Check for preempted RCU readers blocking the current grace period + * for the specified rcu_node structure. If the caller needs a reliable + * answer, it must hold the rcu_node's ->lock. + */ +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) +{ + return rnp->gp_tasks != NULL; +} + +/* + * Record a quiescent state for all tasks that were previously queued + * on the specified rcu_node structure and that were blocking the current + * RCU grace period. The caller must hold the specified rnp->lock with + * irqs disabled, and this lock is released upon return, but irqs remain + * disabled. + */ +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + struct rcu_node *rnp_p; + + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; /* Still need more quiescent states! */ + } + + rnp_p = rnp->parent; + if (rnp_p == NULL) { + /* + * Either there is only one rcu_node in the tree, + * or tasks were kicked up to root rcu_node due to + * CPUs going offline. + */ + rcu_report_qs_rsp(&rcu_preempt_state, flags); + return; + } + + /* Report up the rest of the hierarchy. */ + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ + rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); +} + +/* + * Advance a ->blkd_tasks-list pointer to the next entry, instead + * returning NULL if at the end of the list. + */ +static struct list_head *rcu_next_node_entry(struct task_struct *t, + struct rcu_node *rnp) +{ + struct list_head *np; + + np = t->rcu_node_entry.next; + if (np == &rnp->blkd_tasks) + np = NULL; + return np; +} + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ +void rcu_read_unlock_special(struct task_struct *t) +{ + int empty; + int empty_exp; + int empty_exp_now; + unsigned long flags; + struct list_head *np; +#ifdef CONFIG_RCU_BOOST + struct rt_mutex *rbmp = NULL; +#endif /* #ifdef CONFIG_RCU_BOOST */ + struct rcu_node *rnp; + int special; + + /* NMI handlers cannot block and cannot safely manipulate state. */ + if (in_nmi()) + return; + + local_irq_save(flags); + + /* + * If RCU core is waiting for this CPU to exit critical section, + * let it know that we have done so. + */ + special = t->rcu_read_unlock_special; + if (special & RCU_READ_UNLOCK_NEED_QS) { + rcu_preempt_qs(smp_processor_id()); + } + + /* Hardware IRQ handlers cannot block. */ + if (in_irq() || in_serving_softirq()) { + local_irq_restore(flags); + return; + } + + /* Clean up if blocked during RCU read-side critical section. */ + if (special & RCU_READ_UNLOCK_BLOCKED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + + /* + * Remove this task from the list it blocked on. The + * task can migrate while we acquire the lock, but at + * most one time. So at most two passes through loop. + */ + for (;;) { + rnp = t->rcu_blocked_node; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + if (rnp == t->rcu_blocked_node) + break; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + empty = !rcu_preempt_blocked_readers_cgp(rnp); + empty_exp = !rcu_preempted_readers_exp(rnp); + smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ + np = rcu_next_node_entry(t, rnp); + list_del_init(&t->rcu_node_entry); + t->rcu_blocked_node = NULL; + trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), + rnp->gpnum, t->pid); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp->gp_tasks = np; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp->exp_tasks = np; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; + /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ + if (t->rcu_boost_mutex) { + rbmp = t->rcu_boost_mutex; + t->rcu_boost_mutex = NULL; + } +#endif /* #ifdef CONFIG_RCU_BOOST */ + + /* + * If this was the last task on the current list, and if + * we aren't waiting on any CPUs, report the quiescent state. + * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, + * so we must take a snapshot of the expedited state. + */ + empty_exp_now = !rcu_preempted_readers_exp(rnp); + if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + trace_rcu_quiescent_state_report(TPS("preempt_rcu"), + rnp->gpnum, + 0, rnp->qsmask, + rnp->level, + rnp->grplo, + rnp->grphi, + !!rnp->gp_tasks); + rcu_report_unblock_qs_rnp(rnp, flags); + } else { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } + +#ifdef CONFIG_RCU_BOOST + /* Unboost if we were boosted. */ + if (rbmp) + rt_mutex_unlock(rbmp); +#endif /* #ifdef CONFIG_RCU_BOOST */ + + /* + * If this was the last task on the expedited lists, + * then we need to report up the rcu_node hierarchy. + */ + if (!empty_exp && empty_exp_now) + rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); + } else { + local_irq_restore(flags); + } +} + +#ifdef CONFIG_RCU_CPU_STALL_VERBOSE + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period on the specified rcu_node structure. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ + unsigned long flags; + struct task_struct *t; + + raw_spin_lock_irqsave(&rnp->lock, flags); + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + t = list_entry(rnp->gp_tasks, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + sched_show_task(t); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period. + */ +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + rcu_print_detail_task_stall_rnp(rnp); + rcu_for_each_leaf_node(rsp, rnp) + rcu_print_detail_task_stall_rnp(rnp); +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ + +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ + +#ifdef CONFIG_RCU_CPU_STALL_INFO + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + rnp->level, rnp->grplo, rnp->grphi); +} + +static void rcu_print_task_stall_end(void) +{ + pr_cont("\n"); +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ +} + +static void rcu_print_task_stall_end(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return 0; + rcu_print_task_stall_begin(rnp); + t = list_entry(rnp->gp_tasks, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + rcu_print_task_stall_end(); + return ndetected; +} + +/* + * Check that the list of blocked tasks for the newly completed grace + * period is in fact empty. It is a serious bug to complete a grace + * period that still has RCU readers blocked! This function must be + * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock + * must be held by the caller. + * + * Also, if there are blocked tasks on the list, they automatically + * block the newly created grace period, so set up ->gp_tasks accordingly. + */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +{ + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (!list_empty(&rnp->blkd_tasks)) + rnp->gp_tasks = rnp->blkd_tasks.next; + WARN_ON_ONCE(rnp->qsmask); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Handle tasklist migration for case in which all CPUs covered by the + * specified rcu_node have gone offline. Move them up to the root + * rcu_node. The reason for not just moving them to the immediate + * parent is to remove the need for rcu_read_unlock_special() to + * make more than two attempts to acquire the target rcu_node's lock. + * Returns true if there were tasks blocking the current RCU grace + * period. + * + * Returns 1 if there was previously a task blocking the current grace + * period on the specified rcu_node structure. + * + * The caller must hold rnp->lock with irqs disabled. + */ +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) +{ + struct list_head *lp; + struct list_head *lp_root; + int retval = 0; + struct rcu_node *rnp_root = rcu_get_root(rsp); + struct task_struct *t; + + if (rnp == rnp_root) { + WARN_ONCE(1, "Last CPU thought to be offlined?"); + return 0; /* Shouldn't happen: at least one CPU online. */ + } + + /* If we are on an internal node, complain bitterly. */ + WARN_ON_ONCE(rnp != rdp->mynode); + + /* + * Move tasks up to root rcu_node. Don't try to get fancy for + * this corner-case operation -- just put this node's tasks + * at the head of the root node's list, and update the root node's + * ->gp_tasks and ->exp_tasks pointers to those of this node's, + * if non-NULL. This might result in waiting for more tasks than + * absolutely necessary, but this is a good performance/complexity + * tradeoff. + */ + if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) + retval |= RCU_OFL_TASKS_NORM_GP; + if (rcu_preempted_readers_exp(rnp)) + retval |= RCU_OFL_TASKS_EXP_GP; + lp = &rnp->blkd_tasks; + lp_root = &rnp_root->blkd_tasks; + while (!list_empty(lp)) { + t = list_entry(lp->next, typeof(*t), rcu_node_entry); + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + list_del(&t->rcu_node_entry); + t->rcu_blocked_node = rnp_root; + list_add(&t->rcu_node_entry, lp_root); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp_root->gp_tasks = rnp->gp_tasks; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp_root->exp_tasks = rnp->exp_tasks; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp_root->boost_tasks = rnp->boost_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ + } + + rnp->gp_tasks = NULL; + rnp->exp_tasks = NULL; +#ifdef CONFIG_RCU_BOOST + rnp->boost_tasks = NULL; + /* + * In case root is being boosted and leaf was not. Make sure + * that we boost the tasks blocking the current grace period + * in this case. + */ + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + if (rnp_root->boost_tasks != NULL && + rnp_root->boost_tasks != rnp_root->gp_tasks && + rnp_root->boost_tasks != rnp_root->exp_tasks) + rnp_root->boost_tasks = rnp_root->gp_tasks; + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ +#endif /* #ifdef CONFIG_RCU_BOOST */ + + return retval; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Check for a quiescent state from the current CPU. When a task blocks, + * the task is recorded in the corresponding CPU's rcu_node structure, + * which is checked elsewhere. + * + * Caller must disable hard irqs. + */ +static void rcu_preempt_check_callbacks(int cpu) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) { + rcu_preempt_qs(cpu); + return; + } + if (t->rcu_read_lock_nesting > 0 && + per_cpu(rcu_preempt_data, cpu).qs_pending) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; +} + +#ifdef CONFIG_RCU_BOOST + +static void rcu_preempt_do_callbacks(void) +{ + rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); +} + +#endif /* #ifdef CONFIG_RCU_BOOST */ + +/* + * Queue a preemptible-RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_preempt_state, -1, 0); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). + */ +void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_preempt_state, -1, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + +/** + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + * + * See the description of synchronize_sched() for more detailed information + * on memory ordering guarantees. + */ +void synchronize_rcu(void) +{ + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); + if (!rcu_scheduler_active) + return; + if (rcu_expedited) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static unsigned long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(struct rcu_node *rnp) +{ + return rnp->exp_tasks != NULL; +} + +/* + * return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return !rcu_preempted_readers_exp(rnp) && + ACCESS_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Most callers will set the "wake" flag, but the task initiating the + * expedited grace period need not wake itself. + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) +{ + unsigned long flags; + unsigned long mask; + + raw_spin_lock_irqsave(&rnp->lock, flags); + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (wake) + wake_up(&sync_rcu_preempt_exp_wq); + break; + } + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + rnp->expmask &= ~mask; + } +} + +/* + * Snapshot the tasks blocking the newly started preemptible-RCU expedited + * grace period for the specified rcu_node structure. If there are no such + * tasks, report it up the rcu_node hierarchy. + * + * Caller must hold sync_rcu_preempt_exp_mutex and must exclude + * CPU hotplug operations. + */ +static void +sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) +{ + unsigned long flags; + int must_wait = 0; + + raw_spin_lock_irqsave(&rnp->lock, flags); + if (list_empty(&rnp->blkd_tasks)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } else { + rnp->exp_tasks = rnp->blkd_tasks.next; + rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + must_wait = 1; + } + if (!must_wait) + rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ +} + +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it. The basic + * idea is to invoke synchronize_sched_expedited() to push all the tasks to + * the ->blkd_tasks lists and wait for this list to drain. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. + * In fact, if you are using synchronize_rcu_expedited() in a loop, + * please restructure your code to batch your updates, and then Use a + * single synchronize_rcu() instead. + * + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. + */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_preempt_state; + unsigned long snap; + int trycount = 0; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; + smp_mb(); /* Above access cannot bleed into critical section. */ + + /* + * Block CPU-hotplug operations. This means that any CPU-hotplug + * operation that finds an rcu_node structure with tasks in the + * process of being boosted will know that all tasks blocking + * this expedited grace period will already be in the process of + * being boosted. This simplifies the process of moving tasks + * from leaf to root rcu_node structures. + */ + get_online_cpus(); + + /* + * Acquire lock, falling back to synchronize_rcu() if too many + * lock-acquisition failures. Of course, if someone does the + * expedited grace period for us, just leave. + */ + while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { + if (ULONG_CMP_LT(snap, + ACCESS_ONCE(sync_rcu_preempt_exp_count))) { + put_online_cpus(); + goto mb_ret; /* Others did our work for us. */ + } + if (trycount++ < 10) { + udelay(trycount * num_online_cpus()); + } else { + put_online_cpus(); + wait_rcu_gp(call_rcu); + return; + } + } + if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { + put_online_cpus(); + goto unlock_mb_ret; /* Others did our work for us. */ + } + + /* force all RCU readers onto ->blkd_tasks lists. */ + synchronize_sched_expedited(); + + /* Initialize ->expmask for all non-leaf rcu_node structures. */ + rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->expmask = rnp->qsmaskinit; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } + + /* Snapshot current state of ->blkd_tasks lists. */ + rcu_for_each_leaf_node(rsp, rnp) + sync_rcu_preempt_exp_init(rsp, rnp); + if (NUM_RCU_NODES > 1) + sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); + + put_online_cpus(); + + /* Wait for snapshotted ->blkd_tasks lists to drain. */ + rnp = rcu_get_root(rsp); + wait_event(sync_rcu_preempt_exp_wq, + sync_rcu_preempt_exp_done(rnp)); + + /* Clean up and exit. */ + smp_mb(); /* ensure expedited GP seen before counter increment. */ + ACCESS_ONCE(sync_rcu_preempt_exp_count)++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); +mb_ret: + smp_mb(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + * + * Note that this primitive does not necessarily wait for an RCU grace period + * to complete. For example, if there are no RCU callbacks queued anywhere + * in the system, then rcu_barrier() is within its rights to return + * immediately, without waiting for anything, much less an RCU grace period. + */ +void rcu_barrier(void) +{ + _rcu_barrier(&rcu_preempt_state); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* + * Initialize preemptible RCU's state structures. + */ +static void __init __rcu_init_preempt(void) +{ + rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); +} + +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (likely(list_empty(¤t->rcu_node_entry))) + return; + t->rcu_read_lock_nesting = 1; + barrier(); + t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; + __rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +static struct rcu_state *rcu_state = &rcu_sched_state; + +/* + * Tell them what RCU they are running. + */ +static void __init rcu_bootup_announce(void) +{ + pr_info("Hierarchical RCU implementation.\n"); + rcu_bootup_announce_oddness(); +} + +/* + * Return the number of RCU batches processed thus far for debug & stats. + */ +long rcu_batches_completed(void) +{ + return rcu_batches_completed_sched(); +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Force a quiescent state for RCU, which, because there is no preemptible + * RCU, becomes the same as rcu-sched. + */ +void rcu_force_quiescent_state(void) +{ + rcu_sched_force_quiescent_state(); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + +/* + * Because preemptible RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + +/* + * Because preemptible RCU does not exist, there are never any preempted + * RCU readers. + */ +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) +{ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* Because preemptible RCU does not exist, no quieting of tasks. */ +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) +{ + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ +} + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ + return 0; +} + +/* + * Because there is no preemptible RCU, there can be no readers blocked, + * so there is no need to check for blocked tasks. So check only for + * bogus qsmask values. + */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +{ + WARN_ON_ONCE(rnp->qsmask); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Because preemptible RCU does not exist, it never needs to migrate + * tasks that were blocked within RCU read-side critical sections, and + * such non-existent tasks cannot possibly have been blocking the current + * grace period. + */ +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) +{ + return 0; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to check. + */ +static void rcu_preempt_check_callbacks(int cpu) +{ +} + +/* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). + * + * Because there is no preemptible RCU, we use RCU-sched instead. + */ +void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_sched_state, -1, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + +/* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptible RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Because preemptible RCU does not exist, there is never any need to + * report on tasks preempted in RCU read-side critical sections during + * expedited RCU grace periods. + */ +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake) +{ +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Because preemptible RCU does not exist, rcu_barrier() is just + * another name for rcu_barrier_sched(). + */ +void rcu_barrier(void) +{ + rcu_barrier_sched(); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* + * Because preemptible RCU does not exist, it need not be initialized. + */ +static void __init __rcu_init_preempt(void) +{ +} + +/* + * Because preemptible RCU does not exist, tasks cannot possibly exit + * while in preemptible RCU read-side critical sections. + */ +void exit_rcu(void) +{ +} + +#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ + +#ifdef CONFIG_RCU_BOOST + +#include "../rtmutex_common.h" + +#ifdef CONFIG_RCU_TRACE + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) +{ + if (list_empty(&rnp->blkd_tasks)) + rnp->n_balk_blkd_tasks++; + else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) + rnp->n_balk_exp_gp_tasks++; + else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) + rnp->n_balk_boost_tasks++; + else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) + rnp->n_balk_notblocked++; + else if (rnp->gp_tasks != NULL && + ULONG_CMP_LT(jiffies, rnp->boost_time)) + rnp->n_balk_notyet++; + else + rnp->n_balk_nos++; +} + +#else /* #ifdef CONFIG_RCU_TRACE */ + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) + wake_up_process(t); +} + +/* + * Carry out RCU priority boosting on the task indicated by ->exp_tasks + * or ->boost_tasks, advancing the pointer to the next task in the + * ->blkd_tasks list. + * + * Note that irqs must be enabled: boosting the task can block. + * Returns 1 if there are more tasks needing to be boosted. + */ +static int rcu_boost(struct rcu_node *rnp) +{ + unsigned long flags; + struct rt_mutex mtx; + struct task_struct *t; + struct list_head *tb; + + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + return 0; /* Nothing left to boost. */ + + raw_spin_lock_irqsave(&rnp->lock, flags); + + /* + * Recheck under the lock: all tasks in need of boosting + * might exit their RCU read-side critical sections on their own. + */ + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return 0; + } + + /* + * Preferentially boost tasks blocking expedited grace periods. + * This cannot starve the normal grace periods because a second + * expedited grace period must boost all blocked tasks, including + * those blocking the pre-existing normal grace period. + */ + if (rnp->exp_tasks != NULL) { + tb = rnp->exp_tasks; + rnp->n_exp_boosts++; + } else { + tb = rnp->boost_tasks; + rnp->n_normal_boosts++; + } + rnp->n_tasks_boosted++; + + /* + * We boost task t by manufacturing an rt_mutex that appears to + * be held by task t. We leave a pointer to that rt_mutex where + * task t can find it, and task t will release the mutex when it + * exits its outermost RCU read-side critical section. Then + * simply acquiring this artificial rt_mutex will boost task + * t's priority. (Thanks to tglx for suggesting this approach!) + * + * Note that task t must acquire rnp->lock to remove itself from + * the ->blkd_tasks list, which it will do from exit() if from + * nowhere else. We therefore are guaranteed that task t will + * stay around at least until we drop rnp->lock. Note that + * rnp->lock also resolves races between our priority boosting + * and task t's exiting its outermost RCU read-side critical + * section. + */ + t = container_of(tb, struct task_struct, rcu_node_entry); + rt_mutex_init_proxy_locked(&mtx, t); + t->rcu_boost_mutex = &mtx; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ + rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + + return ACCESS_ONCE(rnp->exp_tasks) != NULL || + ACCESS_ONCE(rnp->boost_tasks) != NULL; +} + +/* + * Priority-boosting kthread. One per leaf rcu_node and one for the + * root rcu_node. + */ +static int rcu_boost_kthread(void *arg) +{ + struct rcu_node *rnp = (struct rcu_node *)arg; + int spincnt = 0; + int more2boost; + + trace_rcu_utilization(TPS("Start boost kthread@init")); + for (;;) { + rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); + rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); + rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; + more2boost = rcu_boost(rnp); + if (more2boost) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; + trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); + schedule_timeout_interruptible(2); + trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); + spincnt = 0; + } + } + /* NOTREACHED */ + trace_rcu_utilization(TPS("End boost kthread@notreached")); + return 0; +} + +/* + * Check to see if it is time to start boosting RCU readers that are + * blocking the current grace period, and, if so, tell the per-rcu_node + * kthread to start boosting them. If there is an expedited grace + * period in progress, it is always time to boost. + * + * The caller must hold rnp->lock, which this function releases. + * The ->boost_kthread_task is immortal, so we don't need to worry + * about it going away. + */ +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +{ + struct task_struct *t; + + if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + rnp->n_balk_exp_gp_tasks++; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + if (rnp->exp_tasks != NULL || + (rnp->gp_tasks != NULL && + rnp->boost_tasks == NULL && + rnp->qsmask == 0 && + ULONG_CMP_GE(jiffies, rnp->boost_time))) { + if (rnp->exp_tasks == NULL) + rnp->boost_tasks = rnp->gp_tasks; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + t = rnp->boost_kthread_task; + if (t) + rcu_wake_cond(t, rnp->boost_kthread_status); + } else { + rcu_initiate_boost_trace(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +/* + * Wake up the per-CPU kthread to invoke RCU callbacks. + */ +static void invoke_rcu_callbacks_kthread(void) +{ + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_cpu_has_work, 1); + if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && + current != __this_cpu_read(rcu_cpu_kthread_task)) { + rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), + __this_cpu_read(rcu_cpu_kthread_status)); + } + local_irq_restore(flags); +} + +/* + * Is the current CPU running the RCU-callbacks kthread? + * Caller must have preemption disabled. + */ +static bool rcu_is_callbacks_kthread(void) +{ + return __this_cpu_read(rcu_cpu_kthread_task) == current; +} + +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) + +/* + * Do priority-boost accounting for the start of a new grace period. + */ +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ + rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; +} + +/* + * Create an RCU-boost kthread for the specified node if one does not + * already exist. We only create this kthread for preemptible RCU. + * Returns zero if all is well, a negated errno otherwise. + */ +static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + int rnp_index = rnp - &rsp->node[0]; + unsigned long flags; + struct sched_param sp; + struct task_struct *t; + + if (&rcu_preempt_state != rsp) + return 0; + + if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) + return 0; + + rsp->boost = 1; + if (rnp->boost_kthread_task != NULL) + return 0; + t = kthread_create(rcu_boost_kthread, (void *)rnp, + "rcub/%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->boost_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + sp.sched_priority = RCU_BOOST_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ + return 0; +} + +static void rcu_kthread_do_work(void) +{ + rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); + rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); + rcu_preempt_do_callbacks(); +} + +static void rcu_cpu_kthread_setup(unsigned int cpu) +{ + struct sched_param sp; + + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); +} + +static void rcu_cpu_kthread_park(unsigned int cpu) +{ + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; +} + +static int rcu_cpu_kthread_should_run(unsigned int cpu) +{ + return __this_cpu_read(rcu_cpu_has_work); +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the + * RCU softirq used in flavors and configurations of RCU that do not + * support RCU priority boosting. + */ +static void rcu_cpu_kthread(unsigned int cpu) +{ + unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); + int spincnt; + + for (spincnt = 0; spincnt < 10; spincnt++) { + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); + local_bh_disable(); + *statusp = RCU_KTHREAD_RUNNING; + this_cpu_inc(rcu_cpu_kthread_loops); + local_irq_disable(); + work = *workp; + *workp = 0; + local_irq_enable(); + if (work) + rcu_kthread_do_work(); + local_bh_enable(); + if (*workp == 0) { + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); + *statusp = RCU_KTHREAD_WAITING; + return; + } + } + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); + schedule_timeout_interruptible(2); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); + *statusp = RCU_KTHREAD_WAITING; +} + +/* + * Set the per-rcu_node kthread's affinity to cover all CPUs that are + * served by the rcu_node in question. The CPU hotplug lock is still + * held, so the value of rnp->qsmaskinit will be stable. + * + * We don't include outgoingcpu in the affinity set, use -1 if there is + * no outgoing CPU. If there are no CPUs left in the affinity set, + * this function allows the kthread to execute on any CPU. + */ +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +{ + struct task_struct *t = rnp->boost_kthread_task; + unsigned long mask = rnp->qsmaskinit; + cpumask_var_t cm; + int cpu; + + if (!t) + return; + if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) + return; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) + if ((mask & 0x1) && cpu != outgoingcpu) + cpumask_set_cpu(cpu, cm); + if (cpumask_weight(cm) == 0) { + cpumask_setall(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) + cpumask_clear_cpu(cpu, cm); + WARN_ON_ONCE(cpumask_weight(cm) == 0); + } + set_cpus_allowed_ptr(t, cm); + free_cpumask_var(cm); +} + +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; + +/* + * Spawn all kthreads -- called as soon as the scheduler is running. + */ +static int __init rcu_spawn_kthreads(void) +{ + struct rcu_node *rnp; + int cpu; + + rcu_scheduler_fully_active = 1; + for_each_possible_cpu(cpu) + per_cpu(rcu_cpu_has_work, cpu) = 0; + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); + rnp = rcu_get_root(rcu_state); + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); + if (NUM_RCU_NODES > 1) { + rcu_for_each_leaf_node(rcu_state, rnp) + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); + } + return 0; +} +early_initcall(rcu_spawn_kthreads); + +static void rcu_prepare_kthreads(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ + if (rcu_scheduler_fully_active) + (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +{ + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +static void invoke_rcu_callbacks_kthread(void) +{ + WARN_ON_ONCE(1); +} + +static bool rcu_is_callbacks_kthread(void) +{ + return false; +} + +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ +} + +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +{ +} + +static int __init rcu_scheduler_really_started(void) +{ + rcu_scheduler_fully_active = 1; + return 0; +} +early_initcall(rcu_scheduler_really_started); + +static void rcu_prepare_kthreads(int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + +#if !defined(CONFIG_RCU_FAST_NO_HZ) + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + * + * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs + * any flavor of RCU. + */ +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +{ + *delta_jiffies = ULONG_MAX; + return rcu_cpu_has_callbacks(cpu, NULL); +} + +/* + * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up + * after it. + */ +static void rcu_cleanup_after_idle(int cpu) +{ +} + +/* + * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, + * is nothing. + */ +static void rcu_prepare_for_idle(int cpu) +{ +} + +/* + * Don't bother keeping a running count of the number of RCU callbacks + * posted because CONFIG_RCU_FAST_NO_HZ=n. + */ +static void rcu_idle_count_callbacks_posted(void) +{ +} + +#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ + +/* + * This code is invoked when a CPU goes idle, at which point we want + * to have the CPU do everything required for RCU so that it can enter + * the energy-efficient dyntick-idle mode. This is handled by a + * state machine implemented by rcu_prepare_for_idle() below. + * + * The following three proprocessor symbols control this state machine: + * + * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted + * to sleep in dyntick-idle mode with RCU callbacks pending. This + * is sized to be roughly one RCU grace period. Those energy-efficiency + * benchmarkers who might otherwise be tempted to set this to a large + * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your + * system. And if you are -that- concerned about energy efficiency, + * just power the system down and be done with it! + * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is + * permitted to sleep in dyntick-idle mode with only lazy RCU + * callbacks pending. Setting this too high can OOM your system. + * + * The values below work well in practice. If future workloads require + * adjustment, they can be converted into kernel config parameters, though + * making the state machine smarter might be a better option. + */ +#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ +#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ + +static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; +module_param(rcu_idle_gp_delay, int, 0644); +static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; +module_param(rcu_idle_lazy_gp_delay, int, 0644); + +extern int tick_nohz_enabled; + +/* + * Try to advance callbacks for all flavors of RCU on the current CPU, but + * only if it has been awhile since the last time we did so. Afterwards, + * if there are any callbacks ready for immediate invocation, return true. + */ +static bool rcu_try_advance_all_cbs(void) +{ + bool cbs_ready = false; + struct rcu_data *rdp; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_node *rnp; + struct rcu_state *rsp; + + /* Exit early if we advanced recently. */ + if (jiffies == rdtp->last_advance_all) + return 0; + rdtp->last_advance_all = jiffies; + + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + + /* + * Don't bother checking unless a grace period has + * completed since we last checked and there are + * callbacks not yet ready to invoke. + */ + if (rdp->completed != rnp->completed && + rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) + note_gp_changes(rsp, rdp); + + if (cpu_has_callbacks_ready_to_invoke(rdp)) + cbs_ready = true; + } + return cbs_ready; +} + +/* + * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready + * to invoke. If the CPU has callbacks, try to advance them. Tell the + * caller to set the timeout based on whether or not there are non-lazy + * callbacks. + * + * The caller must have disabled interrupts. + */ +int rcu_needs_cpu(int cpu, unsigned long *dj) +{ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + /* Snapshot to detect later posting of non-lazy callback. */ + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; + + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { + *dj = ULONG_MAX; + return 0; + } + + /* Attempt to advance callbacks. */ + if (rcu_try_advance_all_cbs()) { + /* Some ready to invoke, so initiate later invocation. */ + invoke_rcu_core(); + return 1; + } + rdtp->last_accelerate = jiffies; + + /* Request timer delay depending on laziness, and round. */ + if (!rdtp->all_lazy) { + *dj = round_up(rcu_idle_gp_delay + jiffies, + rcu_idle_gp_delay) - jiffies; + } else { + *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; + } + return 0; +} + +/* + * Prepare a CPU for idle from an RCU perspective. The first major task + * is to sense whether nohz mode has been enabled or disabled via sysfs. + * The second major task is to check to see if a non-lazy callback has + * arrived at a CPU that previously had only lazy callbacks. The third + * major task is to accelerate (that is, assign grace-period numbers to) + * any recently arrived callbacks. + * + * The caller must have disabled interrupts. + */ +static void rcu_prepare_for_idle(int cpu) +{ + struct rcu_data *rdp; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_node *rnp; + struct rcu_state *rsp; + int tne; + + /* Handle nohz enablement switches conservatively. */ + tne = ACCESS_ONCE(tick_nohz_enabled); + if (tne != rdtp->tick_nohz_enabled_snap) { + if (rcu_cpu_has_callbacks(cpu, NULL)) + invoke_rcu_core(); /* force nohz to see update. */ + rdtp->tick_nohz_enabled_snap = tne; + return; + } + if (!tne) + return; + + /* If this is a no-CBs CPU, no callbacks, just return. */ + if (rcu_is_nocb_cpu(cpu)) + return; + + /* + * If a non-lazy callback arrived at a CPU having only lazy + * callbacks, invoke RCU core for the side-effect of recalculating + * idle duration on re-entry to idle. + */ + if (rdtp->all_lazy && + rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { + rdtp->all_lazy = false; + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; + invoke_rcu_core(); + return; + } + + /* + * If we have not yet accelerated this jiffy, accelerate all + * callbacks on this CPU. + */ + if (rdtp->last_accelerate == jiffies) + return; + rdtp->last_accelerate = jiffies; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (!*rdp->nxttail[RCU_DONE_TAIL]) + continue; + rnp = rdp->mynode; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } +} + +/* + * Clean up for exit from idle. Attempt to advance callbacks based on + * any grace periods that elapsed while the CPU was idle, and if any + * callbacks are now ready to invoke, initiate invocation. + */ +static void rcu_cleanup_after_idle(int cpu) +{ + + if (rcu_is_nocb_cpu(cpu)) + return; + if (rcu_try_advance_all_cbs()) + invoke_rcu_core(); +} + +/* + * Keep a running count of the number of non-lazy callbacks posted + * on this CPU. This running counter (which is never decremented) allows + * rcu_prepare_for_idle() to detect when something out of the idle loop + * posts a callback, even if an equal number of callbacks are invoked. + * Of course, callbacks should only be posted from within a trace event + * designed to be called from idle or from within RCU_NONIDLE(). + */ +static void rcu_idle_count_callbacks_posted(void) +{ + __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); +} + +/* + * Data for flushing lazy RCU callbacks at OOM time. + */ +static atomic_t oom_callback_count; +static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); + +/* + * RCU OOM callback -- decrement the outstanding count and deliver the + * wake-up if we are the last one. + */ +static void rcu_oom_callback(struct rcu_head *rhp) +{ + if (atomic_dec_and_test(&oom_callback_count)) + wake_up(&oom_callback_wq); +} + +/* + * Post an rcu_oom_notify callback on the current CPU if it has at + * least one lazy callback. This will unnecessarily post callbacks + * to CPUs that already have a non-lazy callback at the end of their + * callback list, but this is an infrequent operation, so accept some + * extra overhead to keep things simple. + */ +static void rcu_oom_notify_cpu(void *unused) +{ + struct rcu_state *rsp; + struct rcu_data *rdp; + + for_each_rcu_flavor(rsp) { + rdp = __this_cpu_ptr(rsp->rda); + if (rdp->qlen_lazy != 0) { + atomic_inc(&oom_callback_count); + rsp->call(&rdp->oom_head, rcu_oom_callback); + } + } +} + +/* + * If low on memory, ensure that each CPU has a non-lazy callback. + * This will wake up CPUs that have only lazy callbacks, in turn + * ensuring that they free up the corresponding memory in a timely manner. + * Because an uncertain amount of memory will be freed in some uncertain + * timeframe, we do not claim to have freed anything. + */ +static int rcu_oom_notify(struct notifier_block *self, + unsigned long notused, void *nfreed) +{ + int cpu; + + /* Wait for callbacks from earlier instance to complete. */ + wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); + + /* + * Prevent premature wakeup: ensure that all increments happen + * before there is a chance of the counter reaching zero. + */ + atomic_set(&oom_callback_count, 1); + + get_online_cpus(); + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); + cond_resched(); + } + put_online_cpus(); + + /* Unconditionally decrement: no need to wake ourselves up. */ + atomic_dec(&oom_callback_count); + + return NOTIFY_OK; +} + +static struct notifier_block rcu_oom_nb = { + .notifier_call = rcu_oom_notify +}; + +static int __init rcu_register_oom_notifier(void) +{ + register_oom_notifier(&rcu_oom_nb); + return 0; +} +early_initcall(rcu_register_oom_notifier); + +#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ + +#ifdef CONFIG_RCU_CPU_STALL_INFO + +#ifdef CONFIG_RCU_FAST_NO_HZ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; + + sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", + rdtp->last_accelerate & 0xffff, jiffies & 0xffff, + ulong2long(nlpd), + rdtp->all_lazy ? 'L' : '.', + rdtp->tick_nohz_enabled_snap ? '.' : 'D'); +} + +#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + *cp = '\0'; +} + +#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ + +/* Initiate the stall-info list. */ +static void print_cpu_stall_info_begin(void) +{ + pr_cont("\n"); +} + +/* + * Print out diagnostic information for the specified stalled CPU. + * + * If the specified CPU is aware of the current RCU grace period + * (flavor specified by rsp), then print the number of scheduling + * clock interrupts the CPU has taken during the time that it has + * been aware. Otherwise, print the number of RCU grace periods + * that this CPU is ignorant of, for example, "1" if the CPU was + * aware of the previous grace period. + * + * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + */ +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +{ + char fast_no_hz[72]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = rdp->dynticks; + char *ticks_title; + unsigned long ticks_value; + + if (rsp->gpnum == rdp->gpnum) { + ticks_title = "ticks this GP"; + ticks_value = rdp->ticks_this_gp; + } else { + ticks_title = "GPs behind"; + ticks_value = rsp->gpnum - rdp->gpnum; + } + print_cpu_stall_fast_no_hz(fast_no_hz, cpu); + pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", + cpu, ticks_value, ticks_title, + atomic_read(&rdtp->dynticks) & 0xfff, + rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + fast_no_hz); +} + +/* Terminate the stall-info list. */ +static void print_cpu_stall_info_end(void) +{ + pr_err("\t"); +} + +/* Zero ->ticks_this_gp for all flavors of RCU. */ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ + rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); +} + +/* Increment ->ticks_this_gp for all flavors of RCU. */ +static void increment_cpu_stall_ticks(void) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + __this_cpu_ptr(rsp->rda)->ticks_this_gp++; +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +static void print_cpu_stall_info_begin(void) +{ + pr_cont(" {"); +} + +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +{ + pr_cont(" %d", cpu); +} + +static void print_cpu_stall_info_end(void) +{ + pr_cont("} "); +} + +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ +} + +static void increment_cpu_stall_ticks(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +#ifdef CONFIG_RCU_NOCB_CPU + +/* + * Offload callback processing from the boot-time-specified set of CPUs + * specified by rcu_nocb_mask. For each CPU in the set, there is a + * kthread created that pulls the callbacks from the corresponding CPU, + * waits for a grace period to elapse, and invokes the callbacks. + * The no-CBs CPUs do a wake_up() on their kthread when they insert + * a callback into any empty list, unless the rcu_nocb_poll boot parameter + * has been specified, in which case each kthread actively polls its + * CPU. (Which isn't so great for energy efficiency, but which does + * reduce RCU's overhead on that CPU.) + * + * This is intended to be used in conjunction with Frederic Weisbecker's + * adaptive-idle work, which would seriously reduce OS jitter on CPUs + * running CPU-bound user-mode computations. + * + * Offloading of callback processing could also in theory be used as + * an energy-efficiency measure because CPUs with no RCU callbacks + * queued are more aggressive about entering dyntick-idle mode. + */ + + +/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ +static int __init rcu_nocb_setup(char *str) +{ + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + have_rcu_nocb_mask = true; + cpulist_parse(str, rcu_nocb_mask); + return 1; +} +__setup("rcu_nocbs=", rcu_nocb_setup); + +static int __init parse_rcu_nocb_poll(char *arg) +{ + rcu_nocb_poll = 1; + return 0; +} +early_param("rcu_nocb_poll", parse_rcu_nocb_poll); + +/* + * Do any no-CBs CPUs need another grace period? + * + * Interrupts must be disabled. If the caller does not hold the root + * rnp_node structure's ->lock, the results are advisory only. + */ +static int rcu_nocb_needs_gp(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; +} + +/* + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended + * grace period. + */ +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ + wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); +} + +/* + * Set the root rcu_node structure's ->need_future_gp field + * based on the sum of those of all rcu_node structures. This does + * double-count the root rcu_node structure's requests, but this + * is necessary to handle the possibility of a rcu_nocb_kthread() + * having awakened during the time that the rcu_node structures + * were being updated for the end of the previous grace period. + */ +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ + rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ + init_waitqueue_head(&rnp->nocb_gp_wq[0]); + init_waitqueue_head(&rnp->nocb_gp_wq[1]); +} + +/* Is the specified CPU a no-CPUs CPU? */ +bool rcu_is_nocb_cpu(int cpu) +{ + if (have_rcu_nocb_mask) + return cpumask_test_cpu(cpu, rcu_nocb_mask); + return false; +} + +/* + * Enqueue the specified string of rcu_head structures onto the specified + * CPU's no-CBs lists. The CPU is specified by rdp, the head of the + * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy + * counts are supplied by rhcount and rhcount_lazy. + * + * If warranted, also wake up the kthread servicing this CPUs queues. + */ +static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, + struct rcu_head *rhp, + struct rcu_head **rhtp, + int rhcount, int rhcount_lazy) +{ + int len; + struct rcu_head **old_rhpp; + struct task_struct *t; + + /* Enqueue the callback on the nocb list and update counts. */ + old_rhpp = xchg(&rdp->nocb_tail, rhtp); + ACCESS_ONCE(*old_rhpp) = rhp; + atomic_long_add(rhcount, &rdp->nocb_q_count); + atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); + + /* If we are not being polled and there is a kthread, awaken it ... */ + t = ACCESS_ONCE(rdp->nocb_kthread); + if (rcu_nocb_poll || !t) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WakeNotPoll")); + return; + } + len = atomic_long_read(&rdp->nocb_q_count); + if (old_rhpp == &rdp->nocb_head) { + wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ + rdp->qlen_last_fqs_check = 0; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); + } else if (len > rdp->qlen_last_fqs_check + qhimark) { + wake_up_process(t); /* ... or if many callbacks queued. */ + rdp->qlen_last_fqs_check = LONG_MAX / 2; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); + } else { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); + } + return; +} + +/* + * This is a helper for __call_rcu(), which invokes this when the normal + * callback queue is inoperable. If this is not a no-CBs CPU, this + * function returns failure back to __call_rcu(), which can complain + * appropriately. + * + * Otherwise, this function queues the callback where the corresponding + * "rcuo" kthread can find it. + */ +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy) +{ + + if (!rcu_is_nocb_cpu(rdp->cpu)) + return 0; + __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); + if (__is_kfree_rcu_offset((unsigned long)rhp->func)) + trace_rcu_kfree_callback(rdp->rsp->name, rhp, + (unsigned long)rhp->func, + -atomic_long_read(&rdp->nocb_q_count_lazy), + -atomic_long_read(&rdp->nocb_q_count)); + else + trace_rcu_callback(rdp->rsp->name, rhp, + -atomic_long_read(&rdp->nocb_q_count_lazy), + -atomic_long_read(&rdp->nocb_q_count)); + return 1; +} + +/* + * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is + * not a no-CBs CPU. + */ +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp) +{ + long ql = rsp->qlen; + long qll = rsp->qlen_lazy; + + /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ + if (!rcu_is_nocb_cpu(smp_processor_id())) + return 0; + rsp->qlen = 0; + rsp->qlen_lazy = 0; + + /* First, enqueue the donelist, if any. This preserves CB ordering. */ + if (rsp->orphan_donelist != NULL) { + __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, + rsp->orphan_donetail, ql, qll); + ql = qll = 0; + rsp->orphan_donelist = NULL; + rsp->orphan_donetail = &rsp->orphan_donelist; + } + if (rsp->orphan_nxtlist != NULL) { + __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, + rsp->orphan_nxttail, ql, qll); + ql = qll = 0; + rsp->orphan_nxtlist = NULL; + rsp->orphan_nxttail = &rsp->orphan_nxtlist; + } + return 1; +} + +/* + * If necessary, kick off a new grace period, and either way wait + * for a subsequent grace period to complete. + */ +static void rcu_nocb_wait_gp(struct rcu_data *rdp) +{ + unsigned long c; + bool d; + unsigned long flags; + struct rcu_node *rnp = rdp->mynode; + + raw_spin_lock_irqsave(&rnp->lock, flags); + c = rcu_start_future_gp(rnp, rdp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* + * Wait for the grace period. Do so interruptibly to avoid messing + * up the load average. + */ + trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); + for (;;) { + wait_event_interruptible( + rnp->nocb_gp_wq[c & 0x1], + (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); + if (likely(d)) + break; + flush_signals(current); + trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); + } + trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); + smp_mb(); /* Ensure that CB invocation happens after GP end. */ +} + +/* + * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes + * callbacks queued by the corresponding no-CBs CPU. + */ +static int rcu_nocb_kthread(void *arg) +{ + int c, cl; + bool firsttime = 1; + struct rcu_head *list; + struct rcu_head *next; + struct rcu_head **tail; + struct rcu_data *rdp = arg; + + /* Each pass through this loop invokes one batch of callbacks */ + for (;;) { + /* If not polling, wait for next batch of callbacks. */ + if (!rcu_nocb_poll) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("Sleep")); + wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); + } else if (firsttime) { + firsttime = 0; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("Poll")); + } + list = ACCESS_ONCE(rdp->nocb_head); + if (!list) { + if (!rcu_nocb_poll) + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WokeEmpty")); + schedule_timeout_interruptible(1); + flush_signals(current); + continue; + } + firsttime = 1; + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WokeNonEmpty")); + + /* + * Extract queued callbacks, update counts, and wait + * for a grace period to elapse. + */ + ACCESS_ONCE(rdp->nocb_head) = NULL; + tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); + c = atomic_long_xchg(&rdp->nocb_q_count, 0); + cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); + ACCESS_ONCE(rdp->nocb_p_count) += c; + ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; + rcu_nocb_wait_gp(rdp); + + /* Each pass through the following loop invokes a callback. */ + trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); + c = cl = 0; + while (list) { + next = list->next; + /* Wait for enqueuing to complete, if needed. */ + while (next == NULL && &list->next != tail) { + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WaitQueue")); + schedule_timeout_interruptible(1); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, + TPS("WokeQueue")); + next = list->next; + } + debug_rcu_head_unqueue(list); + local_bh_disable(); + if (__rcu_reclaim(rdp->rsp->name, list)) + cl++; + c++; + local_bh_enable(); + list = next; + } + trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); + ACCESS_ONCE(rdp->nocb_p_count) -= c; + ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; + rdp->n_nocbs_invoked += c; + } + return 0; +} + +/* Initialize per-rcu_data variables for no-CBs CPUs. */ +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ + rdp->nocb_tail = &rdp->nocb_head; + init_waitqueue_head(&rdp->nocb_wq); +} + +/* Create a kthread for each RCU flavor for each no-CBs CPU. */ +static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + struct task_struct *t; + + if (rcu_nocb_mask == NULL) + return; + for_each_cpu(cpu, rcu_nocb_mask) { + rdp = per_cpu_ptr(rsp->rda, cpu); + t = kthread_run(rcu_nocb_kthread, rdp, + "rcuo%c/%d", rsp->abbr, cpu); + BUG_ON(IS_ERR(t)); + ACCESS_ONCE(rdp->nocb_kthread) = t; + } +} + +/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ +static bool init_nocb_callback_list(struct rcu_data *rdp) +{ + if (rcu_nocb_mask == NULL || + !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) + return false; + rdp->nxttail[RCU_NEXT_TAIL] = NULL; + return true; +} + +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static int rcu_nocb_needs_gp(struct rcu_state *rsp) +{ + return 0; +} + +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ +} + +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ +} + +static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, + bool lazy) +{ + return 0; +} + +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, + struct rcu_data *rdp) +{ + return 0; +} + +static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) +{ +} + +static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) +{ +} + +static bool init_nocb_callback_list(struct rcu_data *rdp) +{ + return false; +} + +#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ + +/* + * An adaptive-ticks CPU can potentially execute in kernel mode for an + * arbitrarily long period of time with the scheduling-clock tick turned + * off. RCU will be paying attention to this CPU because it is in the + * kernel, but the CPU cannot be guaranteed to be executing the RCU state + * machine because the scheduling-clock tick has been disabled. Therefore, + * if an adaptive-ticks CPU is failing to respond to the current grace + * period and has not be idle from an RCU perspective, kick it. + */ +static void rcu_kick_nohz_cpu(int cpu) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_cpu(cpu)) + smp_send_reschedule(cpu); +#endif /* #ifdef CONFIG_NO_HZ_FULL */ +} + + +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + +/* + * Define RCU flavor that holds sysidle state. This needs to be the + * most active flavor of RCU. + */ +#ifdef CONFIG_PREEMPT_RCU +static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; +#else /* #ifdef CONFIG_PREEMPT_RCU */ +static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + +static int full_sysidle_state; /* Current system-idle state. */ +#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ +#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ +#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ +#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ +#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ + +/* + * Invoked to note exit from irq or task transition to idle. Note that + * usermode execution does -not- count as idle here! After all, we want + * to detect full-system idle states, not RCU quiescent states and grace + * periods. The caller must have disabled interrupts. + */ +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +{ + unsigned long j; + + /* Adjust nesting, check for fully idle. */ + if (irq) { + rdtp->dynticks_idle_nesting--; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); + if (rdtp->dynticks_idle_nesting != 0) + return; /* Still not fully idle. */ + } else { + if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == + DYNTICK_TASK_NEST_VALUE) { + rdtp->dynticks_idle_nesting = 0; + } else { + rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); + return; /* Still not fully idle. */ + } + } + + /* Record start of fully idle period. */ + j = jiffies; + ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; + smp_mb__before_atomic_inc(); + atomic_inc(&rdtp->dynticks_idle); + smp_mb__after_atomic_inc(); + WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); +} + +/* + * Unconditionally force exit from full system-idle state. This is + * invoked when a normal CPU exits idle, but must be called separately + * for the timekeeping CPU (tick_do_timer_cpu). The reason for this + * is that the timekeeping CPU is permitted to take scheduling-clock + * interrupts while the system is in system-idle state, and of course + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock + * interrupt from any other type of interrupt. + */ +void rcu_sysidle_force_exit(void) +{ + int oldstate = ACCESS_ONCE(full_sysidle_state); + int newoldstate; + + /* + * Each pass through the following loop attempts to exit full + * system-idle state. If contention proves to be a problem, + * a trylock-based contention tree could be used here. + */ + while (oldstate > RCU_SYSIDLE_SHORT) { + newoldstate = cmpxchg(&full_sysidle_state, + oldstate, RCU_SYSIDLE_NOT); + if (oldstate == newoldstate && + oldstate == RCU_SYSIDLE_FULL_NOTED) { + rcu_kick_nohz_cpu(tick_do_timer_cpu); + return; /* We cleared it, done! */ + } + oldstate = newoldstate; + } + smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ +} + +/* + * Invoked to note entry to irq or task transition from idle. Note that + * usermode execution does -not- count as idle here! The caller must + * have disabled interrupts. + */ +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +{ + /* Adjust nesting, check for already non-idle. */ + if (irq) { + rdtp->dynticks_idle_nesting++; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); + if (rdtp->dynticks_idle_nesting != 1) + return; /* Already non-idle. */ + } else { + /* + * Allow for irq misnesting. Yes, it really is possible + * to enter an irq handler then never leave it, and maybe + * also vice versa. Handle both possibilities. + */ + if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { + rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); + return; /* Already non-idle. */ + } else { + rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; + } + } + + /* Record end of idle period. */ + smp_mb__before_atomic_inc(); + atomic_inc(&rdtp->dynticks_idle); + smp_mb__after_atomic_inc(); + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); + + /* + * If we are the timekeeping CPU, we are permitted to be non-idle + * during a system-idle state. This must be the case, because + * the timekeeping CPU has to take scheduling-clock interrupts + * during the time that the system is transitioning to full + * system-idle state. This means that the timekeeping CPU must + * invoke rcu_sysidle_force_exit() directly if it does anything + * more than take a scheduling-clock interrupt. + */ + if (smp_processor_id() == tick_do_timer_cpu) + return; + + /* Update system-idle state: We are clearly no longer fully idle! */ + rcu_sysidle_force_exit(); +} + +/* + * Check to see if the current CPU is idle. Note that usermode execution + * does not count as idle. The caller must have disabled interrupts. + */ +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ + int cur; + unsigned long j; + struct rcu_dynticks *rdtp = rdp->dynticks; + + /* + * If some other CPU has already reported non-idle, if this is + * not the flavor of RCU that tracks sysidle state, or if this + * is an offline or the timekeeping CPU, nothing to do. + */ + if (!*isidle || rdp->rsp != rcu_sysidle_state || + cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) + return; + if (rcu_gp_in_progress(rdp->rsp)) + WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); + + /* Pick up current idle and NMI-nesting counter and check. */ + cur = atomic_read(&rdtp->dynticks_idle); + if (cur & 0x1) { + *isidle = false; /* We are not idle! */ + return; + } + smp_mb(); /* Read counters before timestamps. */ + + /* Pick up timestamps. */ + j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); + /* If this CPU entered idle more recently, update maxj timestamp. */ + if (ULONG_CMP_LT(*maxj, j)) + *maxj = j; +} + +/* + * Is this the flavor of RCU that is handling full-system idle? + */ +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return rsp == rcu_sysidle_state; +} + +/* + * Bind the grace-period kthread for the sysidle flavor of RCU to the + * timekeeping CPU. + */ +static void rcu_bind_gp_kthread(void) +{ + int cpu = ACCESS_ONCE(tick_do_timer_cpu); + + if (cpu < 0 || cpu >= nr_cpu_ids) + return; + if (raw_smp_processor_id() != cpu) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + +/* + * Return a delay in jiffies based on the number of CPUs, rcu_node + * leaf fanout, and jiffies tick rate. The idea is to allow larger + * systems more time to transition to full-idle state in order to + * avoid the cache thrashing that otherwise occur on the state variable. + * Really small systems (less than a couple of tens of CPUs) should + * instead use a single global atomically incremented counter, and later + * versions of this will automatically reconfigure themselves accordingly. + */ +static unsigned long rcu_sysidle_delay(void) +{ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return 0; + return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); +} + +/* + * Advance the full-system-idle state. This is invoked when all of + * the non-timekeeping CPUs are idle. + */ +static void rcu_sysidle(unsigned long j) +{ + /* Check the current state. */ + switch (ACCESS_ONCE(full_sysidle_state)) { + case RCU_SYSIDLE_NOT: + + /* First time all are idle, so note a short idle period. */ + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; + break; + + case RCU_SYSIDLE_SHORT: + + /* + * Idle for a bit, time to advance to next state? + * cmpxchg failure means race with non-idle, let them win. + */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); + break; + + case RCU_SYSIDLE_LONG: + + /* + * Do an additional check pass before advancing to full. + * cmpxchg failure means race with non-idle, let them win. + */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); + break; + + default: + break; + } +} + +/* + * Found a non-idle non-timekeeping CPU, so kick the system-idle state + * back to the beginning. + */ +static void rcu_sysidle_cancel(void) +{ + smp_mb(); + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; +} + +/* + * Update the sysidle state based on the results of a force-quiescent-state + * scan of the CPUs' dyntick-idle state. + */ +static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, + unsigned long maxj, bool gpkt) +{ + if (rsp != rcu_sysidle_state) + return; /* Wrong flavor, ignore. */ + if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return; /* Running state machine from timekeeping CPU. */ + if (isidle) + rcu_sysidle(maxj); /* More idle! */ + else + rcu_sysidle_cancel(); /* Idle is over. */ +} + +/* + * Wrapper for rcu_sysidle_report() when called from the grace-period + * kthread's context. + */ +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ + rcu_sysidle_report(rsp, isidle, maxj, true); +} + +/* Callback and function for forcing an RCU grace period. */ +struct rcu_sysidle_head { + struct rcu_head rh; + int inuse; +}; + +static void rcu_sysidle_cb(struct rcu_head *rhp) +{ + struct rcu_sysidle_head *rshp; + + /* + * The following memory barrier is needed to replace the + * memory barriers that would normally be in the memory + * allocator. + */ + smp_mb(); /* grace period precedes setting inuse. */ + + rshp = container_of(rhp, struct rcu_sysidle_head, rh); + ACCESS_ONCE(rshp->inuse) = 0; +} + +/* + * Check to see if the system is fully idle, other than the timekeeping CPU. + * The caller must have disabled interrupts. + */ +bool rcu_sys_is_idle(void) +{ + static struct rcu_sysidle_head rsh; + int rss = ACCESS_ONCE(full_sysidle_state); + + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) + return false; + + /* Handle small-system case by doing a full scan of CPUs. */ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { + int oldrss = rss - 1; + + /* + * One pass to advance to each state up to _FULL. + * Give up if any pass fails to advance the state. + */ + while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { + int cpu; + bool isidle = true; + unsigned long maxj = jiffies - ULONG_MAX / 4; + struct rcu_data *rdp; + + /* Scan all the CPUs looking for nonidle CPUs. */ + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); + rcu_sysidle_check_cpu(rdp, &isidle, &maxj); + if (!isidle) + break; + } + rcu_sysidle_report(rcu_sysidle_state, + isidle, maxj, false); + oldrss = rss; + rss = ACCESS_ONCE(full_sysidle_state); + } + } + + /* If this is the first observation of an idle period, record it. */ + if (rss == RCU_SYSIDLE_FULL) { + rss = cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); + return rss == RCU_SYSIDLE_FULL; + } + + smp_mb(); /* ensure rss load happens before later caller actions. */ + + /* If already fully idle, tell the caller (in case of races). */ + if (rss == RCU_SYSIDLE_FULL_NOTED) + return true; + + /* + * If we aren't there yet, and a grace period is not in flight, + * initiate a grace period. Either way, tell the caller that + * we are not there yet. We use an xchg() rather than an assignment + * to make up for the memory barriers that would otherwise be + * provided by the memory allocator. + */ + if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && + !rcu_gp_in_progress(rcu_sysidle_state) && + !rsh.inuse && xchg(&rsh.inuse, 1) == 0) + call_rcu(&rsh.rh, rcu_sysidle_cb); + return false; +} + +/* + * Initialize dynticks sysidle state for CPUs coming online. + */ +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) +{ + rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; +} + +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +{ +} + +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +{ +} + +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ +} + +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return false; +} + +static void rcu_bind_gp_kthread(void) +{ +} + +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ +} + +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c new file mode 100644 index 000000000000..3596797b7e46 --- /dev/null +++ b/kernel/rcu/tree_trace.c @@ -0,0 +1,500 @@ +/* + * Read-Copy Update tracing for classic implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Papers: http://www.rdrop.com/users/paulmck/RCU + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RCU_TREE_NONCORE +#include "tree.h" + +static int r_open(struct inode *inode, struct file *file, + const struct seq_operations *op) +{ + int ret = seq_open(file, op); + if (!ret) { + struct seq_file *m = (struct seq_file *)file->private_data; + m->private = inode->i_private; + } + return ret; +} + +static void *r_start(struct seq_file *m, loff_t *pos) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + *pos = cpumask_next(*pos - 1, cpu_possible_mask); + if ((*pos) < nr_cpu_ids) + return per_cpu_ptr(rsp->rda, *pos); + return NULL; +} + +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return r_start(m, pos); +} + +static void r_stop(struct seq_file *m, void *v) +{ +} + +static int show_rcubarrier(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + seq_printf(m, "bcc: %d nbd: %lu\n", + atomic_read(&rsp->barrier_cpu_count), + rsp->n_barrier_done); + return 0; +} + +static int rcubarrier_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcubarrier, inode->i_private); +} + +static const struct file_operations rcubarrier_fops = { + .owner = THIS_MODULE, + .open = rcubarrier_open, + .read = seq_read, + .llseek = no_llseek, + .release = single_release, +}; + +#ifdef CONFIG_RCU_BOOST + +static char convert_kthread_status(unsigned int kthread_status) +{ + if (kthread_status > RCU_KTHREAD_MAX) + return '?'; + return "SRWOY"[kthread_status]; +} + +#endif /* #ifdef CONFIG_RCU_BOOST */ + +static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) +{ + long ql, qll; + + if (!rdp->beenonline) + return; + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!' : ' ', + ulong2long(rdp->completed), ulong2long(rdp->gpnum), + rdp->passed_quiesce, rdp->qs_pending); + seq_printf(m, " dt=%d/%llx/%d df=%lu", + atomic_read(&rdp->dynticks->dynticks), + rdp->dynticks->dynticks_nesting, + rdp->dynticks->dynticks_nmi_nesting, + rdp->dynticks_fqs); + seq_printf(m, " of=%lu", rdp->offline_fqs); + rcu_nocb_q_lengths(rdp, &ql, &qll); + qll += rdp->qlen_lazy; + ql += rdp->qlen; + seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", + qll, ql, + ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != + rdp->nxttail[RCU_NEXT_TAIL]], + ".R"[rdp->nxttail[RCU_WAIT_TAIL] != + rdp->nxttail[RCU_NEXT_READY_TAIL]], + ".W"[rdp->nxttail[RCU_DONE_TAIL] != + rdp->nxttail[RCU_WAIT_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); +#ifdef CONFIG_RCU_BOOST + seq_printf(m, " kt=%d/%c ktl=%x", + per_cpu(rcu_cpu_has_work, rdp->cpu), + convert_kthread_status(per_cpu(rcu_cpu_kthread_status, + rdp->cpu)), + per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_printf(m, " b=%ld", rdp->blimit); + seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", + rdp->n_cbs_invoked, rdp->n_nocbs_invoked, + rdp->n_cbs_orphaned, rdp->n_cbs_adopted); +} + +static int show_rcudata(struct seq_file *m, void *v) +{ + print_one_rcu_data(m, (struct rcu_data *)v); + return 0; +} + +static const struct seq_operations rcudate_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = show_rcudata, +}; + +static int rcudata_open(struct inode *inode, struct file *file) +{ + return r_open(inode, file, &rcudate_op); +} + +static const struct file_operations rcudata_fops = { + .owner = THIS_MODULE, + .open = rcudata_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + +static int show_rcuexp(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + + seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", + atomic_long_read(&rsp->expedited_start), + atomic_long_read(&rsp->expedited_done), + atomic_long_read(&rsp->expedited_wrap), + atomic_long_read(&rsp->expedited_tryfail), + atomic_long_read(&rsp->expedited_workdone1), + atomic_long_read(&rsp->expedited_workdone2), + atomic_long_read(&rsp->expedited_normal), + atomic_long_read(&rsp->expedited_stoppedcpus), + atomic_long_read(&rsp->expedited_done_tries), + atomic_long_read(&rsp->expedited_done_lost), + atomic_long_read(&rsp->expedited_done_exit)); + return 0; +} + +static int rcuexp_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcuexp, inode->i_private); +} + +static const struct file_operations rcuexp_fops = { + .owner = THIS_MODULE, + .open = rcuexp_open, + .read = seq_read, + .llseek = no_llseek, + .release = single_release, +}; + +#ifdef CONFIG_RCU_BOOST + +static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) +{ + seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", + rnp->grplo, rnp->grphi, + "T."[list_empty(&rnp->blkd_tasks)], + "N."[!rnp->gp_tasks], + "E."[!rnp->exp_tasks], + "B."[!rnp->boost_tasks], + convert_kthread_status(rnp->boost_kthread_status), + rnp->n_tasks_boosted, rnp->n_exp_boosts, + rnp->n_normal_boosts); + seq_printf(m, "j=%04x bt=%04x\n", + (int)(jiffies & 0xffff), + (int)(rnp->boost_time & 0xffff)); + seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", + rnp->n_balk_blkd_tasks, + rnp->n_balk_exp_gp_tasks, + rnp->n_balk_boost_tasks, + rnp->n_balk_notblocked, + rnp->n_balk_notyet, + rnp->n_balk_nos); +} + +static int show_rcu_node_boost(struct seq_file *m, void *unused) +{ + struct rcu_node *rnp; + + rcu_for_each_leaf_node(&rcu_preempt_state, rnp) + print_one_rcu_node_boost(m, rnp); + return 0; +} + +static int rcu_node_boost_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_node_boost, NULL); +} + +static const struct file_operations rcu_node_boost_fops = { + .owner = THIS_MODULE, + .open = rcu_node_boost_open, + .read = seq_read, + .llseek = no_llseek, + .release = single_release, +}; + +#endif /* #ifdef CONFIG_RCU_BOOST */ + +static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) +{ + unsigned long gpnum; + int level = 0; + struct rcu_node *rnp; + + gpnum = rsp->gpnum; + seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", + ulong2long(rsp->completed), ulong2long(gpnum), + rsp->fqs_state, + (long)(rsp->jiffies_force_qs - jiffies), + (int)(jiffies & 0xffff)); + seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", + rsp->n_force_qs, rsp->n_force_qs_ngp, + rsp->n_force_qs - rsp->n_force_qs_ngp, + rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); + for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { + if (rnp->level != level) { + seq_puts(m, "\n"); + level = rnp->level; + } + seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", + rnp->qsmask, rnp->qsmaskinit, + ".G"[rnp->gp_tasks != NULL], + ".E"[rnp->exp_tasks != NULL], + ".T"[!list_empty(&rnp->blkd_tasks)], + rnp->grplo, rnp->grphi, rnp->grpnum); + } + seq_puts(m, "\n"); +} + +static int show_rcuhier(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + print_one_rcu_state(m, rsp); + return 0; +} + +static int rcuhier_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcuhier, inode->i_private); +} + +static const struct file_operations rcuhier_fops = { + .owner = THIS_MODULE, + .open = rcuhier_open, + .read = seq_read, + .llseek = no_llseek, + .release = single_release, +}; + +static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long completed; + unsigned long gpnum; + unsigned long gpage; + unsigned long gpmax; + struct rcu_node *rnp = &rsp->node[0]; + + raw_spin_lock_irqsave(&rnp->lock, flags); + completed = ACCESS_ONCE(rsp->completed); + gpnum = ACCESS_ONCE(rsp->gpnum); + if (completed == gpnum) + gpage = 0; + else + gpage = jiffies - rsp->gp_start; + gpmax = rsp->gp_max; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", + ulong2long(completed), ulong2long(gpnum), gpage, gpmax); +} + +static int show_rcugp(struct seq_file *m, void *v) +{ + struct rcu_state *rsp = (struct rcu_state *)m->private; + show_one_rcugp(m, rsp); + return 0; +} + +static int rcugp_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcugp, inode->i_private); +} + +static const struct file_operations rcugp_fops = { + .owner = THIS_MODULE, + .open = rcugp_open, + .read = seq_read, + .llseek = no_llseek, + .release = single_release, +}; + +static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) +{ + if (!rdp->beenonline) + return; + seq_printf(m, "%3d%cnp=%ld ", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!' : ' ', + rdp->n_rcu_pending); + seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", + rdp->n_rp_qs_pending, + rdp->n_rp_report_qs, + rdp->n_rp_cb_ready, + rdp->n_rp_cpu_needs_gp); + seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", + rdp->n_rp_gp_completed, + rdp->n_rp_gp_started, + rdp->n_rp_need_nothing); +} + +static int show_rcu_pending(struct seq_file *m, void *v) +{ + print_one_rcu_pending(m, (struct rcu_data *)v); + return 0; +} + +static const struct seq_operations rcu_pending_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = show_rcu_pending, +}; + +static int rcu_pending_open(struct inode *inode, struct file *file) +{ + return r_open(inode, file, &rcu_pending_op); +} + +static const struct file_operations rcu_pending_fops = { + .owner = THIS_MODULE, + .open = rcu_pending_open, + .read = seq_read, + .llseek = no_llseek, + .release = seq_release, +}; + +static int show_rcutorture(struct seq_file *m, void *unused) +{ + seq_printf(m, "rcutorture test sequence: %lu %s\n", + rcutorture_testseq >> 1, + (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); + seq_printf(m, "rcutorture update version number: %lu\n", + rcutorture_vernum); + return 0; +} + +static int rcutorture_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcutorture, NULL); +} + +static const struct file_operations rcutorture_fops = { + .owner = THIS_MODULE, + .open = rcutorture_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; + +static int __init rcutree_trace_init(void) +{ + struct rcu_state *rsp; + struct dentry *retval; + struct dentry *rspdir; + + rcudir = debugfs_create_dir("rcu", NULL); + if (!rcudir) + goto free_out; + + for_each_rcu_flavor(rsp) { + rspdir = debugfs_create_dir(rsp->name, rcudir); + if (!rspdir) + goto free_out; + + retval = debugfs_create_file("rcudata", 0444, + rspdir, rsp, &rcudata_fops); + if (!retval) + goto free_out; + + retval = debugfs_create_file("rcuexp", 0444, + rspdir, rsp, &rcuexp_fops); + if (!retval) + goto free_out; + + retval = debugfs_create_file("rcu_pending", 0444, + rspdir, rsp, &rcu_pending_fops); + if (!retval) + goto free_out; + + retval = debugfs_create_file("rcubarrier", 0444, + rspdir, rsp, &rcubarrier_fops); + if (!retval) + goto free_out; + +#ifdef CONFIG_RCU_BOOST + if (rsp == &rcu_preempt_state) { + retval = debugfs_create_file("rcuboost", 0444, + rspdir, NULL, &rcu_node_boost_fops); + if (!retval) + goto free_out; + } +#endif + + retval = debugfs_create_file("rcugp", 0444, + rspdir, rsp, &rcugp_fops); + if (!retval) + goto free_out; + + retval = debugfs_create_file("rcuhier", 0444, + rspdir, rsp, &rcuhier_fops); + if (!retval) + goto free_out; + } + + retval = debugfs_create_file("rcutorture", 0444, rcudir, + NULL, &rcutorture_fops); + if (!retval) + goto free_out; + return 0; +free_out: + debugfs_remove_recursive(rcudir); + return 1; +} + +static void __exit rcutree_trace_cleanup(void) +{ + debugfs_remove_recursive(rcudir); +} + + +module_init(rcutree_trace_init); +module_exit(rcutree_trace_cleanup); + +MODULE_AUTHOR("Paul E. McKenney"); +MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); +MODULE_LICENSE("GPL"); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c new file mode 100644 index 000000000000..6cb3dff89e2b --- /dev/null +++ b/kernel/rcu/update.c @@ -0,0 +1,347 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2001 + * + * Authors: Dipankar Sarma + * Manfred Spraul + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * http://lse.sourceforge.net/locking/rcupdate.html + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include "rcu.h" + +MODULE_ALIAS("rcupdate"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcupdate." + +module_param(rcu_expedited, int, 0); + +#ifdef CONFIG_PREEMPT_RCU + +/* + * Preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* critical section after entry code. */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting != 1) { + --t->rcu_read_lock_nesting; + } else { + barrier(); /* critical section before exit code. */ + t->rcu_read_lock_nesting = INT_MIN; +#ifdef CONFIG_PROVE_RCU_DELAY + udelay(10); /* Make preemption more probable. */ +#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } +#ifdef CONFIG_PROVE_LOCKING + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +#endif /* #ifdef CONFIG_PREEMPT_RCU */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_key; +struct lockdep_map rcu_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +EXPORT_SYMBOL_GPL(rcu_lock_map); + +static struct lock_class_key rcu_bh_lock_key; +struct lockdep_map rcu_bh_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); +EXPORT_SYMBOL_GPL(rcu_bh_lock_map); + +static struct lock_class_key rcu_sched_lock_key; +struct lockdep_map rcu_sched_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); +EXPORT_SYMBOL_GPL(rcu_sched_lock_map); + +int notrace debug_lockdep_rcu_enabled(void) +{ + return rcu_scheduler_active && debug_locks && + current->lockdep_recursion == 0; +} +EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); + +/** + * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? + * + * Check for bottom half being disabled, which covers both the + * CONFIG_PROVE_RCU and not cases. Note that if someone uses + * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) + * will show the situation. This is useful for debug checks in functions + * that require that they be called within an RCU read-side critical + * section. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. + * + * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * offline from an RCU perspective, so check for those as well. + */ +int rcu_read_lock_bh_held(void) +{ + if (!debug_lockdep_rcu_enabled()) + return 1; + if (!rcu_is_watching()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; + return in_softirq() || irqs_disabled(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; +}; + +/* + * Awaken the corresponding synchronize_rcu() instance now that a + * grace period has elapsed. + */ +static void wakeme_after_rcu(struct rcu_head *head) +{ + struct rcu_synchronize *rcu; + + rcu = container_of(head, struct rcu_synchronize, head); + complete(&rcu->completion); +} + +void wait_rcu_gp(call_rcu_func_t crf) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + crf(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(wait_rcu_gp); + +#ifdef CONFIG_PROVE_RCU +/* + * wrapper function to avoid #include problems. + */ +int rcu_my_thread_group_empty(void) +{ + return thread_group_empty(current); +} +EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); +#endif /* #ifdef CONFIG_PROVE_RCU */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static inline void debug_init_rcu_head(struct rcu_head *head) +{ + debug_object_init(head, &rcuhead_debug_descr); +} + +static inline void debug_rcu_head_free(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + * Activation is performed internally by call_rcu(). + */ +static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. We just make sure that it is + * tracked in the object tracker. + */ + debug_object_init(head, &rcuhead_debug_descr); + debug_object_activate(head, &rcuhead_debug_descr); + return 0; + default: + return 1; + } +} + +/** + * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects of a new rcu_head structure that + * has been allocated as an auto variable on the stack. This function + * is not required for rcu_head structures that are statically defined or + * that are dynamically allocated on the heap. This function has no + * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void init_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_init_on_stack(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); + +/** + * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects that an on-stack rcu_head structure + * is about to go out of scope. As with init_rcu_head_on_stack(), this + * function is not required for rcu_head structures that are statically + * defined or that are dynamically allocated on the heap. Also as with + * init_rcu_head_on_stack(), this function has no effect for + * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void destroy_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); + +struct debug_obj_descr rcuhead_debug_descr = { + .name = "rcu_head", + .fixup_activate = rcuhead_fixup_activate, +}; +EXPORT_SYMBOL_GPL(rcuhead_debug_descr); +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) +void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, + unsigned long secs, + unsigned long c_old, unsigned long c) +{ + trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c); +} +EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); +#else +#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ + do { } while (0) +#endif + +#ifdef CONFIG_RCU_STALL_COMMON + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; + +module_param(rcu_cpu_stall_suppress, int, 0644); +module_param(rcu_cpu_stall_timeout, int, 0644); + +int rcu_jiffies_till_stall_check(void) +{ + int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. + */ + if (till_stall_check < 3) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; + till_stall_check = 3; + } else if (till_stall_check > 300) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} + +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_suppress = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); + return 0; +} +early_initcall(check_cpu_stall_init); + +#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c deleted file mode 100644 index c07af1c4e1bb..000000000000 --- a/kernel/rcupdate.c +++ /dev/null @@ -1,341 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2001 - * - * Authors: Dipankar Sarma - * Manfred Spraul - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * Papers: - * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf - * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) - * - * For detailed explanation of Read-Copy Update mechanism see - - * http://lse.sourceforge.net/locking/rcupdate.html - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -#include "rcu.h" - -module_param(rcu_expedited, int, 0); - -#ifdef CONFIG_PREEMPT_RCU - -/* - * Preemptible RCU implementation for rcu_read_lock(). - * Just increment ->rcu_read_lock_nesting, shared state will be updated - * if we block. - */ -void __rcu_read_lock(void) -{ - current->rcu_read_lock_nesting++; - barrier(); /* critical section after entry code. */ -} -EXPORT_SYMBOL_GPL(__rcu_read_lock); - -/* - * Preemptible RCU implementation for rcu_read_unlock(). - * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost - * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then - * invoke rcu_read_unlock_special() to clean up after a context switch - * in an RCU read-side critical section and other special cases. - */ -void __rcu_read_unlock(void) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting != 1) { - --t->rcu_read_lock_nesting; - } else { - barrier(); /* critical section before exit code. */ - t->rcu_read_lock_nesting = INT_MIN; -#ifdef CONFIG_PROVE_RCU_DELAY - udelay(10); /* Make preemption more probable. */ -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ - barrier(); /* assign before ->rcu_read_unlock_special load */ - if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); - barrier(); /* ->rcu_read_unlock_special load before assign */ - t->rcu_read_lock_nesting = 0; - } -#ifdef CONFIG_PROVE_LOCKING - { - int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); - - WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); - } -#endif /* #ifdef CONFIG_PROVE_LOCKING */ -} -EXPORT_SYMBOL_GPL(__rcu_read_unlock); - -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key rcu_lock_key; -struct lockdep_map rcu_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); -EXPORT_SYMBOL_GPL(rcu_lock_map); - -static struct lock_class_key rcu_bh_lock_key; -struct lockdep_map rcu_bh_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); -EXPORT_SYMBOL_GPL(rcu_bh_lock_map); - -static struct lock_class_key rcu_sched_lock_key; -struct lockdep_map rcu_sched_lock_map = - STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); -EXPORT_SYMBOL_GPL(rcu_sched_lock_map); - -int notrace debug_lockdep_rcu_enabled(void) -{ - return rcu_scheduler_active && debug_locks && - current->lockdep_recursion == 0; -} -EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); - -/** - * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? - * - * Check for bottom half being disabled, which covers both the - * CONFIG_PROVE_RCU and not cases. Note that if someone uses - * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) - * will show the situation. This is useful for debug checks in functions - * that require that they be called within an RCU read-side critical - * section. - * - * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. - * - * Note that rcu_read_lock() is disallowed if the CPU is either idle or - * offline from an RCU perspective, so check for those as well. - */ -int rcu_read_lock_bh_held(void) -{ - if (!debug_lockdep_rcu_enabled()) - return 1; - if (!rcu_is_watching()) - return 0; - if (!rcu_lockdep_current_cpu_online()) - return 0; - return in_softirq() || irqs_disabled(); -} -EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); - -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* - * Awaken the corresponding synchronize_rcu() instance now that a - * grace period has elapsed. - */ -static void wakeme_after_rcu(struct rcu_head *head) -{ - struct rcu_synchronize *rcu; - - rcu = container_of(head, struct rcu_synchronize, head); - complete(&rcu->completion); -} - -void wait_rcu_gp(call_rcu_func_t crf) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - crf(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(wait_rcu_gp); - -#ifdef CONFIG_PROVE_RCU -/* - * wrapper function to avoid #include problems. - */ -int rcu_my_thread_group_empty(void) -{ - return thread_group_empty(current); -} -EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); -#endif /* #ifdef CONFIG_PROVE_RCU */ - -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD -static inline void debug_init_rcu_head(struct rcu_head *head) -{ - debug_object_init(head, &rcuhead_debug_descr); -} - -static inline void debug_rcu_head_free(struct rcu_head *head) -{ - debug_object_free(head, &rcuhead_debug_descr); -} - -/* - * fixup_activate is called when: - * - an active object is activated - * - an unknown object is activated (might be a statically initialized object) - * Activation is performed internally by call_rcu(). - */ -static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) -{ - struct rcu_head *head = addr; - - switch (state) { - - case ODEBUG_STATE_NOTAVAILABLE: - /* - * This is not really a fixup. We just make sure that it is - * tracked in the object tracker. - */ - debug_object_init(head, &rcuhead_debug_descr); - debug_object_activate(head, &rcuhead_debug_descr); - return 0; - default: - return 1; - } -} - -/** - * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects - * @head: pointer to rcu_head structure to be initialized - * - * This function informs debugobjects of a new rcu_head structure that - * has been allocated as an auto variable on the stack. This function - * is not required for rcu_head structures that are statically defined or - * that are dynamically allocated on the heap. This function has no - * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. - */ -void init_rcu_head_on_stack(struct rcu_head *head) -{ - debug_object_init_on_stack(head, &rcuhead_debug_descr); -} -EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); - -/** - * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects - * @head: pointer to rcu_head structure to be initialized - * - * This function informs debugobjects that an on-stack rcu_head structure - * is about to go out of scope. As with init_rcu_head_on_stack(), this - * function is not required for rcu_head structures that are statically - * defined or that are dynamically allocated on the heap. Also as with - * init_rcu_head_on_stack(), this function has no effect for - * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. - */ -void destroy_rcu_head_on_stack(struct rcu_head *head) -{ - debug_object_free(head, &rcuhead_debug_descr); -} -EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); - -struct debug_obj_descr rcuhead_debug_descr = { - .name = "rcu_head", - .fixup_activate = rcuhead_fixup_activate, -}; -EXPORT_SYMBOL_GPL(rcuhead_debug_descr); -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - -#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) -void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, - unsigned long secs, - unsigned long c_old, unsigned long c) -{ - trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c); -} -EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); -#else -#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ - do { } while (0) -#endif - -#ifdef CONFIG_RCU_STALL_COMMON - -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA 0 -#endif - -int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; - -module_param(rcu_cpu_stall_suppress, int, 0644); -module_param(rcu_cpu_stall_timeout, int, 0644); - -int rcu_jiffies_till_stall_check(void) -{ - int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); - - /* - * Limit check must be consistent with the Kconfig limits - * for CONFIG_RCU_CPU_STALL_TIMEOUT. - */ - if (till_stall_check < 3) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; - till_stall_check = 3; - } else if (till_stall_check > 300) { - ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; - till_stall_check = 300; - } - return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} - -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ - rcu_cpu_stall_suppress = 1; - return NOTIFY_DONE; -} - -static struct notifier_block rcu_panic_block = { - .notifier_call = rcu_panic, -}; - -static int __init check_cpu_stall_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); - return 0; -} -early_initcall(check_cpu_stall_init); - -#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c deleted file mode 100644 index 312e9709713f..000000000000 --- a/kernel/rcutiny.c +++ /dev/null @@ -1,388 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2008 - * - * Author: Paul E. McKenney - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_RCU_TRACE -#include -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - -#include "rcu.h" - -/* Forward declarations for rcutiny_plugin.h. */ -struct rcu_ctrlblk; -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); -static void rcu_process_callbacks(struct softirq_action *unused); -static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), - struct rcu_ctrlblk *rcp); - -static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - -#include "rcutiny_plugin.h" - -/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ -static void rcu_idle_enter_common(long long newval) -{ - if (newval) { - RCU_TRACE(trace_rcu_dyntick(TPS("--="), - rcu_dynticks_nesting, newval)); - rcu_dynticks_nesting = newval; - return; - } - RCU_TRACE(trace_rcu_dyntick(TPS("Start"), - rcu_dynticks_nesting, newval)); - if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), - rcu_dynticks_nesting, newval)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } - rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ - barrier(); - rcu_dynticks_nesting = newval; -} - -/* - * Enter idle, which is an extended quiescent state if we have fully - * entered that mode (i.e., if the new value of dynticks_nesting is zero). - */ -void rcu_idle_enter(void) -{ - unsigned long flags; - long long newval; - - local_irq_save(flags); - WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); - if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) - newval = 0; - else - newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(newval); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(rcu_idle_enter); - -/* - * Exit an interrupt handler towards idle. - */ -void rcu_irq_exit(void) -{ - unsigned long flags; - long long newval; - - local_irq_save(flags); - newval = rcu_dynticks_nesting - 1; - WARN_ON_ONCE(newval < 0); - rcu_idle_enter_common(newval); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(rcu_irq_exit); - -/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ -static void rcu_idle_exit_common(long long oldval) -{ - if (oldval) { - RCU_TRACE(trace_rcu_dyntick(TPS("++="), - oldval, rcu_dynticks_nesting)); - return; - } - RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); - if (!is_idle_task(current)) { - struct task_struct *idle = idle_task(smp_processor_id()); - - RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), - oldval, rcu_dynticks_nesting)); - ftrace_dump(DUMP_ALL); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - -/* - * Exit idle, so that we are no longer in an extended quiescent state. - */ -void rcu_idle_exit(void) -{ - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - WARN_ON_ONCE(rcu_dynticks_nesting < 0); - if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) - rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else - rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_idle_exit_common(oldval); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(rcu_idle_exit); - -/* - * Enter an interrupt handler, moving away from idle. - */ -void rcu_irq_enter(void) -{ - unsigned long flags; - long long oldval; - - local_irq_save(flags); - oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting++; - WARN_ON_ONCE(rcu_dynticks_nesting == 0); - rcu_idle_exit_common(oldval); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(rcu_irq_enter); - -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) - -/* - * Test whether RCU thinks that the current CPU is idle. - */ -bool __rcu_is_watching(void) -{ - return rcu_dynticks_nesting; -} -EXPORT_SYMBOL(__rcu_is_watching); - -#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ - -/* - * Test whether the current CPU was interrupted from idle. Nested - * interrupts don't count, we must be running at the first interrupt - * level. - */ -static int rcu_is_cpu_rrupt_from_idle(void) -{ - return rcu_dynticks_nesting <= 1; -} - -/* - * Helper function for rcu_sched_qs() and rcu_bh_qs(). - * Also irqs are disabled to avoid confusion due to interrupt handlers - * invoking call_rcu(). - */ -static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) -{ - RCU_TRACE(reset_cpu_stall_ticks(rcp)); - if (rcp->rcucblist != NULL && - rcp->donetail != rcp->curtail) { - rcp->donetail = rcp->curtail; - return 1; - } - - return 0; -} - -/* - * Record an rcu quiescent state. And an rcu_bh quiescent state while we - * are at it, given that any rcu quiescent state is also an rcu_bh - * quiescent state. Use "+" instead of "||" to defeat short circuiting. - */ -void rcu_sched_qs(int cpu) -{ - unsigned long flags; - - local_irq_save(flags); - if (rcu_qsctr_help(&rcu_sched_ctrlblk) + - rcu_qsctr_help(&rcu_bh_ctrlblk)) - raise_softirq(RCU_SOFTIRQ); - local_irq_restore(flags); -} - -/* - * Record an rcu_bh quiescent state. - */ -void rcu_bh_qs(int cpu) -{ - unsigned long flags; - - local_irq_save(flags); - if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - raise_softirq(RCU_SOFTIRQ); - local_irq_restore(flags); -} - -/* - * Check to see if the scheduling-clock interrupt came from an extended - * quiescent state, and, if so, tell RCU about it. This function must - * be called from hardirq context. It is normally called from the - * scheduling-clock interrupt. - */ -void rcu_check_callbacks(int cpu, int user) -{ - RCU_TRACE(check_cpu_stalls()); - if (user || rcu_is_cpu_rrupt_from_idle()) - rcu_sched_qs(cpu); - else if (!in_softirq()) - rcu_bh_qs(cpu); -} - -/* - * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure - * whose grace period has elapsed. - */ -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) -{ - const char *rn = NULL; - struct rcu_head *next, *list; - unsigned long flags; - RCU_TRACE(int cb_count = 0); - - /* If no RCU callbacks ready to invoke, just return. */ - if (&rcp->rcucblist == rcp->donetail) { - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, - !!ACCESS_ONCE(rcp->rcucblist), - need_resched(), - is_idle_task(current), - false)); - return; - } - - /* Move the ready-to-invoke callbacks to a local list. */ - local_irq_save(flags); - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); - list = rcp->rcucblist; - rcp->rcucblist = *rcp->donetail; - *rcp->donetail = NULL; - if (rcp->curtail == rcp->donetail) - rcp->curtail = &rcp->rcucblist; - rcp->donetail = &rcp->rcucblist; - local_irq_restore(flags); - - /* Invoke the callbacks on the local list. */ - RCU_TRACE(rn = rcp->name); - while (list) { - next = list->next; - prefetch(next); - debug_rcu_head_unqueue(list); - local_bh_disable(); - __rcu_reclaim(rn, list); - local_bh_enable(); - list = next; - RCU_TRACE(cb_count++); - } - RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, - cb_count, 0, need_resched(), - is_idle_task(current), - false)); -} - -static void rcu_process_callbacks(struct softirq_action *unused) -{ - __rcu_process_callbacks(&rcu_sched_ctrlblk); - __rcu_process_callbacks(&rcu_bh_ctrlblk); -} - -/* - * Wait for a grace period to elapse. But it is illegal to invoke - * synchronize_sched() from within an RCU read-side critical section. - * Therefore, any legal call to synchronize_sched() is a quiescent - * state, and so on a UP system, synchronize_sched() need do nothing. - * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the - * benefits of doing might_sleep() to reduce latency.) - * - * Cool, huh? (Due to Josh Triplett.) - * - * But we want to make this a static inline later. The cond_resched() - * currently makes this problematic. - */ -void synchronize_sched(void) -{ - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU read-side critical section"); - cond_resched(); -} -EXPORT_SYMBOL_GPL(synchronize_sched); - -/* - * Helper function for call_rcu() and call_rcu_bh(). - */ -static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), - struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - debug_rcu_head_queue(head); - head->func = func; - head->next = NULL; - - local_irq_save(flags); - *rcp->curtail = head; - rcp->curtail = &head->next; - RCU_TRACE(rcp->qlen++); - local_irq_restore(flags); -} - -/* - * Post an RCU callback to be invoked after the end of an RCU-sched grace - * period. But since we have but one CPU, that would be after any - * quiescent state. - */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_sched_ctrlblk); -} -EXPORT_SYMBOL_GPL(call_rcu_sched); - -/* - * Post an RCU bottom-half callback to be invoked after any subsequent - * quiescent state. - */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_bh_ctrlblk); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); - -void rcu_init(void) -{ - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); -} diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h deleted file mode 100644 index 280d06cae352..000000000000 --- a/kernel/rcutiny_plugin.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition - * Internal non-public definitions that provide either classic - * or preemptible semantics. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (c) 2010 Linaro - * - * Author: Paul E. McKenney - */ - -#include -#include -#include -#include - -/* Global control variables for rcupdate callback mechanism. */ -struct rcu_ctrlblk { - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ - struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ - struct rcu_head **curtail; /* ->next pointer of last CB. */ - RCU_TRACE(long qlen); /* Number of pending CBs. */ - RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ - RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ - RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ - RCU_TRACE(const char *name); /* Name of RCU type. */ -}; - -/* Definition for rcupdate control block. */ -static struct rcu_ctrlblk rcu_sched_ctrlblk = { - .donetail = &rcu_sched_ctrlblk.rcucblist, - .curtail = &rcu_sched_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_sched") -}; - -static struct rcu_ctrlblk rcu_bh_ctrlblk = { - .donetail = &rcu_bh_ctrlblk.rcucblist, - .curtail = &rcu_bh_ctrlblk.rcucblist, - RCU_TRACE(.name = "rcu_bh") -}; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -#include - -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); - -/* - * During boot, we forgive RCU lockdep issues. After this function is - * invoked, we start taking RCU lockdep issues seriously. - */ -void __init rcu_scheduler_starting(void) -{ - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; -} - -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -#ifdef CONFIG_RCU_TRACE - -static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) -{ - unsigned long flags; - - local_irq_save(flags); - rcp->qlen -= n; - local_irq_restore(flags); -} - -/* - * Dump statistics for TINY_RCU, such as they are. - */ -static int show_tiny_stats(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); - seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); - return 0; -} - -static int show_tiny_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_tiny_stats, NULL); -} - -static const struct file_operations show_tiny_stats_fops = { - .owner = THIS_MODULE, - .open = show_tiny_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutiny_trace_init(void) -{ - struct dentry *retval; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - retval = debugfs_create_file("rcudata", 0444, rcudir, - NULL, &show_tiny_stats_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} - -static void __exit rcutiny_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - -module_init(rcutiny_trace_init); -module_exit(rcutiny_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); -MODULE_LICENSE("GPL"); - -static void check_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long j; - unsigned long js; - - if (rcu_cpu_stall_suppress) - return; - rcp->ticks_this_gp++; - j = jiffies; - js = rcp->jiffies_stall; - if (*rcp->curtail && ULONG_CMP_GE(j, js)) { - pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", - rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, - jiffies - rcp->gp_start, rcp->qlen); - dump_stack(); - } - if (*rcp->curtail && ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + - 3 * rcu_jiffies_till_stall_check() + 3; - else if (ULONG_CMP_GE(j, js)) - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -} - -static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) -{ - rcp->ticks_this_gp = 0; - rcp->gp_start = jiffies; - rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); -} - -static void check_cpu_stalls(void) -{ - RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); - RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); -} - -#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c deleted file mode 100644 index be63101c6175..000000000000 --- a/kernel/rcutorture.c +++ /dev/null @@ -1,2139 +0,0 @@ -/* - * Read-Copy Update module-based torture test facility - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2005, 2006 - * - * Authors: Paul E. McKenney - * Josh Triplett - * - * See also: Documentation/RCU/torture.txt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); - -static int fqs_duration; -module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); -static int fqs_holdoff; -module_param(fqs_holdoff, int, 0444); -MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); -static int fqs_stutter = 3; -module_param(fqs_stutter, int, 0444); -MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -static bool gp_exp; -module_param(gp_exp, bool, 0444); -MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); -static bool gp_normal; -module_param(gp_normal, bool, 0444); -MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); -static int irqreader = 1; -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); -static int n_barrier_cbs; -module_param(n_barrier_cbs, int, 0444); -MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); -static int nfakewriters = 4; -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -static int nreaders = -1; -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -static int object_debug; -module_param(object_debug, int, 0444); -MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); -static int onoff_holdoff; -module_param(onoff_holdoff, int, 0444); -MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); -static int onoff_interval; -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); -static int shuffle_interval = 3; -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -static int shutdown_secs; -module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); -static int stall_cpu; -module_param(stall_cpu, int, 0444); -MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); -static int stall_cpu_holdoff = 10; -module_param(stall_cpu_holdoff, int, 0444); -MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); -static int stat_interval = 60; -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -static int stutter = 5; -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -static int test_boost = 1; -module_param(test_boost, int, 0444); -MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -static int test_boost_duration = 4; -module_param(test_boost_duration, int, 0444); -MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); -static int test_boost_interval = 7; -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); -static bool test_no_idle_hz = true; -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); -static char *torture_type = "rcu"; -module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -static bool verbose; -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); - -#define TORTURE_FLAG "-torture:" -#define PRINTK_STRING(s) \ - do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) - -static char printk_buf[4096]; - -static int nrealreaders; -static struct task_struct *writer_task; -static struct task_struct **fakewriter_tasks; -static struct task_struct **reader_tasks; -static struct task_struct *stats_task; -static struct task_struct *shuffler_task; -static struct task_struct *stutter_task; -static struct task_struct *fqs_task; -static struct task_struct *boost_tasks[NR_CPUS]; -static struct task_struct *shutdown_task; -#ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *onoff_task; -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static struct task_struct *stall_task; -static struct task_struct **barrier_cbs_tasks; -static struct task_struct *barrier_task; - -#define RCU_TORTURE_PIPE_LEN 10 - -struct rcu_torture { - struct rcu_head rtort_rcu; - int rtort_pipe_count; - struct list_head rtort_free; - int rtort_mbtest; -}; - -static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture __rcu *rcu_torture_current; -static unsigned long rcu_torture_current_version; -static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; -static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = - { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = - { 0 }; -static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; -static atomic_t n_rcu_torture_alloc; -static atomic_t n_rcu_torture_alloc_fail; -static atomic_t n_rcu_torture_free; -static atomic_t n_rcu_torture_mberror; -static atomic_t n_rcu_torture_error; -static long n_rcu_torture_barrier_error; -static long n_rcu_torture_boost_ktrerror; -static long n_rcu_torture_boost_rterror; -static long n_rcu_torture_boost_failure; -static long n_rcu_torture_boosts; -static long n_rcu_torture_timers; -static long n_offline_attempts; -static long n_offline_successes; -static unsigned long sum_offline; -static int min_offline = -1; -static int max_offline; -static long n_online_attempts; -static long n_online_successes; -static unsigned long sum_online; -static int min_online = -1; -static int max_online; -static long n_barrier_attempts; -static long n_barrier_successes; -static struct list_head rcu_torture_removed; -static cpumask_var_t shuffle_tmp_mask; - -static int stutter_pause_test; - -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -module_param(rcutorture_runnable, int, 0444); -MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); - -#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) -#define rcu_can_boost() 1 -#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ -#define rcu_can_boost() 0 -#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ - -#ifdef CONFIG_RCU_TRACE -static u64 notrace rcu_trace_clock_local(void) -{ - u64 ts = trace_clock_local(); - unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); - return ts; -} -#else /* #ifdef CONFIG_RCU_TRACE */ -static u64 notrace rcu_trace_clock_local(void) -{ - return 0ULL; -} -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - -static unsigned long shutdown_time; /* jiffies to system shutdown. */ -static unsigned long boost_starttime; /* jiffies of next boost test start. */ -DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ - /* and boost task create/destroy. */ -static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ -static bool barrier_phase; /* Test phase. */ -static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ -static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ -static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); - -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ - -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -static int fullstop = FULLSTOP_RMMOD; -/* - * Protect fullstop transitions and spawning of kthreads. - */ -static DEFINE_MUTEX(fullstop_mutex); - -/* Forward reference. */ -static void rcu_torture_cleanup(void); - -/* - * Detect and respond to a system shutdown. - */ -static int -rcutorture_shutdown_notify(struct notifier_block *unused1, - unsigned long unused2, void *unused3) -{ - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) - fullstop = FULLSTOP_SHUTDOWN; - else - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - return NOTIFY_DONE; -} - -/* - * Absorb kthreads into a kernel function that won't return, so that - * they won't ever access module text or data again. - */ -static void rcutorture_shutdown_absorb(const char *title) -{ - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "rcutorture thread %s parking due to system shutdown\n", - title); - schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); - } -} - -/* - * Allocate an element from the rcu_tortures pool. - */ -static struct rcu_torture * -rcu_torture_alloc(void) -{ - struct list_head *p; - - spin_lock_bh(&rcu_torture_lock); - if (list_empty(&rcu_torture_freelist)) { - atomic_inc(&n_rcu_torture_alloc_fail); - spin_unlock_bh(&rcu_torture_lock); - return NULL; - } - atomic_inc(&n_rcu_torture_alloc); - p = rcu_torture_freelist.next; - list_del_init(p); - spin_unlock_bh(&rcu_torture_lock); - return container_of(p, struct rcu_torture, rtort_free); -} - -/* - * Free an element to the rcu_tortures pool. - */ -static void -rcu_torture_free(struct rcu_torture *p) -{ - atomic_inc(&n_rcu_torture_free); - spin_lock_bh(&rcu_torture_lock); - list_add_tail(&p->rtort_free, &rcu_torture_freelist); - spin_unlock_bh(&rcu_torture_lock); -} - -struct rcu_random_state { - unsigned long rrs_state; - long rrs_count; -}; - -#define RCU_RANDOM_MULT 39916801 /* prime */ -#define RCU_RANDOM_ADD 479001701 /* prime */ -#define RCU_RANDOM_REFRESH 10000 - -#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } - -/* - * Crude but fast random-number generator. Uses a linear congruential - * generator, with occasional help from cpu_clock(). - */ -static unsigned long -rcu_random(struct rcu_random_state *rrsp) -{ - if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += (unsigned long)local_clock(); - rrsp->rrs_count = RCU_RANDOM_REFRESH; - } - rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; - return swahw32(rrsp->rrs_state); -} - -static void -rcu_stutter_wait(const char *title) -{ - while (stutter_pause_test || !rcutorture_runnable) { - if (rcutorture_runnable) - schedule_timeout_interruptible(1); - else - schedule_timeout_interruptible(round_jiffies_relative(HZ)); - rcutorture_shutdown_absorb(title); - } -} - -/* - * Operations vector for selecting different types of tests. - */ - -struct rcu_torture_ops { - void (*init)(void); - int (*readlock)(void); - void (*read_delay)(struct rcu_random_state *rrsp); - void (*readunlock)(int idx); - int (*completed)(void); - void (*deferred_free)(struct rcu_torture *p); - void (*sync)(void); - void (*exp_sync)(void); - void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); - void (*cb_barrier)(void); - void (*fqs)(void); - int (*stats)(char *page); - int irq_capable; - int can_boost; - const char *name; -}; - -static struct rcu_torture_ops *cur_ops; - -/* - * Definitions for rcu torture testing. - */ - -static int rcu_torture_read_lock(void) __acquires(RCU) -{ - rcu_read_lock(); - return 0; -} - -static void rcu_read_delay(struct rcu_random_state *rrsp) -{ - const unsigned long shortdelay_us = 200; - const unsigned long longdelay_ms = 50; - - /* We want a short delay sometimes to make a reader delay the grace - * period, and we want a long delay occasionally to trigger - * force_quiescent_state. */ - - if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) - mdelay(longdelay_ms); - if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) - udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT - if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) - preempt_schedule(); /* No QS if preempt_disable() in effect */ -#endif -} - -static void rcu_torture_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); -} - -static int rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - -static void -rcu_torture_cb(struct rcu_head *p) -{ - int i; - struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - - if (fullstop != FULLSTOP_DONTSTOP) { - /* Test is ending, just drop callbacks on the floor. */ - /* The next initialization will pick up the pieces. */ - return; - } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - rcu_torture_free(rp); - } else { - cur_ops->deferred_free(rp); - } -} - -static int rcu_no_completed(void) -{ - return 0; -} - -static void rcu_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu(&p->rtort_rcu, rcu_torture_cb); -} - -static void rcu_sync_torture_init(void) -{ - INIT_LIST_HEAD(&rcu_torture_removed); -} - -static struct rcu_torture_ops rcu_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferred_free = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .exp_sync = synchronize_rcu_expedited, - .call = call_rcu, - .cb_barrier = rcu_barrier, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu" -}; - -/* - * Definitions for rcu_bh torture testing. - */ - -static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) -{ - rcu_read_lock_bh(); - return 0; -} - -static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) -{ - rcu_read_unlock_bh(); -} - -static int rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - -static void rcu_bh_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops rcu_bh_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_bh_torture_deferred_free, - .sync = synchronize_rcu_bh, - .exp_sync = synchronize_rcu_bh_expedited, - .call = call_rcu_bh, - .cb_barrier = rcu_barrier_bh, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh" -}; - -/* - * Definitions for srcu torture testing. - */ - -DEFINE_STATIC_SRCU(srcu_ctl); - -static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock(&srcu_ctl); -} - -static void srcu_read_delay(struct rcu_random_state *rrsp) -{ - long delay; - const long uspertick = 1000000 / HZ; - const long longdelay = 10; - - /* We want there to be long-running readers, but not all the time. */ - - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) - schedule_timeout_interruptible(longdelay); - else - rcu_read_delay(rrsp); -} - -static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock(&srcu_ctl, idx); -} - -static int srcu_torture_completed(void) -{ - return srcu_batches_completed(&srcu_ctl); -} - -static void srcu_torture_deferred_free(struct rcu_torture *rp) -{ - call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); -} - -static void srcu_torture_synchronize(void) -{ - synchronize_srcu(&srcu_ctl); -} - -static void srcu_torture_call(struct rcu_head *head, - void (*func)(struct rcu_head *head)) -{ - call_srcu(&srcu_ctl, head, func); -} - -static void srcu_torture_barrier(void) -{ - srcu_barrier(&srcu_ctl); -} - -static int srcu_torture_stats(char *page) -{ - int cnt = 0; - int cpu; - int idx = srcu_ctl.completed & 0x1; - - cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); - } - cnt += sprintf(&page[cnt], "\n"); - return cnt; -} - -static void srcu_torture_synchronize_expedited(void) -{ - synchronize_srcu_expedited(&srcu_ctl); -} - -static struct rcu_torture_ops srcu_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = srcu_torture_deferred_free, - .sync = srcu_torture_synchronize, - .exp_sync = srcu_torture_synchronize_expedited, - .call = srcu_torture_call, - .cb_barrier = srcu_torture_barrier, - .stats = srcu_torture_stats, - .name = "srcu" -}; - -/* - * Definitions for sched torture testing. - */ - -static int sched_torture_read_lock(void) -{ - preempt_disable(); - return 0; -} - -static void sched_torture_read_unlock(int idx) -{ - preempt_enable(); -} - -static void rcu_sched_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops sched_ops = { - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sched_torture_deferred_free, - .sync = synchronize_sched, - .exp_sync = synchronize_sched_expedited, - .call = call_rcu_sched, - .cb_barrier = rcu_barrier_sched, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "sched" -}; - -/* - * RCU torture priority-boost testing. Runs one real-time thread per - * CPU for moderate bursts, repeatedly registering RCU callbacks and - * spinning waiting for them to be invoked. If a given callback takes - * too long to be invoked, we assume that priority inversion has occurred. - */ - -struct rcu_boost_inflight { - struct rcu_head rcu; - int inflight; -}; - -static void rcu_torture_boost_cb(struct rcu_head *head) -{ - struct rcu_boost_inflight *rbip = - container_of(head, struct rcu_boost_inflight, rcu); - - smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ - rbip->inflight = 0; -} - -static int rcu_torture_boost(void *arg) -{ - unsigned long call_rcu_time; - unsigned long endtime; - unsigned long oldstarttime; - struct rcu_boost_inflight rbi = { .inflight = 0 }; - struct sched_param sp; - - VERBOSE_PRINTK_STRING("rcu_torture_boost started"); - - /* Set real-time priority. */ - sp.sched_priority = 1; - if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); - n_rcu_torture_boost_rterror++; - } - - init_rcu_head_on_stack(&rbi.rcu); - /* Each pass through the following loop does one boost-test cycle. */ - do { - /* Wait for the next test interval. */ - oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { - schedule_timeout_interruptible(oldstarttime - jiffies); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) - goto checkwait; - } - - /* Do one boost-test interval. */ - endtime = oldstarttime + test_boost_duration * HZ; - call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { - /* If we don't have a callback in flight, post one. */ - if (!rbi.inflight) { - smp_mb(); /* RCU core before ->inflight = 1. */ - rbi.inflight = 1; - call_rcu(&rbi.rcu, rcu_torture_boost_cb); - if (jiffies - call_rcu_time > - test_boost_duration * HZ - HZ / 2) { - VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); - n_rcu_torture_boost_failure++; - } - call_rcu_time = jiffies; - } - cond_resched(); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) - goto checkwait; - } - - /* - * Set the start time of the next test interval. - * Yes, this is vulnerable to long delays, but such - * delays simply cause a false negative for the next - * interval. Besides, we are running at RT priority, - * so delays should be relatively rare. - */ - while (oldstarttime == boost_starttime && - !kthread_should_stop()) { - if (mutex_trylock(&boost_mutex)) { - boost_starttime = jiffies + - test_boost_interval * HZ; - n_rcu_torture_boosts++; - mutex_unlock(&boost_mutex); - break; - } - schedule_timeout_uninterruptible(1); - } - - /* Go do the stutter. */ -checkwait: rcu_stutter_wait("rcu_torture_boost"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - - /* Clean up and exit. */ - VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); - rcutorture_shutdown_absorb("rcu_torture_boost"); - while (!kthread_should_stop() || rbi.inflight) - schedule_timeout_uninterruptible(1); - smp_mb(); /* order accesses to ->inflight before stack-frame death. */ - destroy_rcu_head_on_stack(&rbi.rcu); - return 0; -} - -/* - * RCU torture force-quiescent-state kthread. Repeatedly induces - * bursts of calls to force_quiescent_state(), increasing the probability - * of occurrence of some important types of race conditions. - */ -static int -rcu_torture_fqs(void *arg) -{ - unsigned long fqs_resume_time; - int fqs_burst_remaining; - - VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); - do { - fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && - !kthread_should_stop()) { - schedule_timeout_interruptible(1); - } - fqs_burst_remaining = fqs_duration; - while (fqs_burst_remaining > 0 && - !kthread_should_stop()) { - cur_ops->fqs(); - udelay(fqs_holdoff); - fqs_burst_remaining -= fqs_holdoff; - } - rcu_stutter_wait("rcu_torture_fqs"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fqs"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture writer kthread. Repeatedly substitutes a new structure - * for that pointed to by rcu_torture_current, freeing the old structure - * after a series of grace periods (the "pipeline"). - */ -static int -rcu_torture_writer(void *arg) -{ - bool exp; - int i; - struct rcu_torture *rp; - struct rcu_torture *rp1; - struct rcu_torture *old_rp; - static DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1); - rp = rcu_torture_alloc(); - if (rp == NULL) - continue; - rp->rtort_pipe_count = 0; - udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_dereference_check(rcu_torture_current, - current == writer_task); - rp->rtort_mbtest = 1; - rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ - if (old_rp) { - i = old_rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; - if (gp_normal == gp_exp) - exp = !!(rcu_random(&rand) & 0x80); - else - exp = gp_exp; - if (!exp) { - cur_ops->deferred_free(old_rp); - } else { - cur_ops->exp_sync(); - list_add(&old_rp->rtort_free, - &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, - &rcu_torture_removed, - rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= - RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } - } - } - rcutorture_record_progress(++rcu_torture_current_version); - rcu_stutter_wait("rcu_torture_writer"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - rcutorture_shutdown_absorb("rcu_torture_writer"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture fake writer kthread. Repeatedly calls sync, with a random - * delay between calls. - */ -static int -rcu_torture_fakewriter(void *arg) -{ - DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); - udelay(rcu_random(&rand) & 0x3ff); - if (cur_ops->cb_barrier != NULL && - rcu_random(&rand) % (nfakewriters * 8) == 0) { - cur_ops->cb_barrier(); - } else if (gp_normal == gp_exp) { - if (rcu_random(&rand) & 0x80) - cur_ops->sync(); - else - cur_ops->exp_sync(); - } else if (gp_normal) { - cur_ops->sync(); - } else { - cur_ops->exp_sync(); - } - rcu_stutter_wait("rcu_torture_fakewriter"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fakewriter"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - ftrace_dump(DUMP_ALL); -} - -/* - * RCU torture reader from timer handler. Dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. - */ -static void rcu_torture_timer(unsigned long unused) -{ - int idx; - int completed; - int completed_end; - static DEFINE_RCU_RANDOM(rand); - static DEFINE_SPINLOCK(rand_lock); - struct rcu_torture *p; - int pipe_count; - unsigned long long ts; - - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); - if (p == NULL) { - /* Leave because rcu_torture_writer is not yet underway */ - cur_ops->readunlock(idx); - return; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - spin_lock(&rand_lock); - cur_ops->read_delay(&rand); - n_rcu_torture_timers++; - spin_unlock(&rand_lock); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed_end = cur_ops->completed(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); - rcutorture_trace_dump(); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); -} - -/* - * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. - */ -static int -rcu_torture_reader(void *arg) -{ - int completed; - int completed_end; - int idx; - DEFINE_RCU_RANDOM(rand); - struct rcu_torture *p; - int pipe_count; - struct timer_list t; - unsigned long long ts; - - VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); - set_user_nice(current, 19); - if (irqreader && cur_ops->irq_capable) - setup_timer_on_stack(&t, rcu_torture_timer, 0); - - do { - if (irqreader && cur_ops->irq_capable) { - if (!timer_pending(&t)) - mod_timer(&t, jiffies + 1); - } - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); - if (p == NULL) { - /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); - continue; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(&rand); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed_end = cur_ops->completed(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); - rcutorture_trace_dump(); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); - schedule(); - rcu_stutter_wait("rcu_torture_reader"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); - rcutorture_shutdown_absorb("rcu_torture_reader"); - if (irqreader && cur_ops->irq_capable) - del_timer_sync(&t); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * Create an RCU-torture statistics message in the specified buffer. - */ -static int -rcu_torture_printk(char *page) -{ - int cnt = 0; - int cpu; - int i; - long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; - } - } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { - if (pipesummary[i] != 0) - break; - } - cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current_version, - list_empty(&rcu_torture_freelist), - atomic_read(&n_rcu_torture_alloc), - atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free)); - cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", - atomic_read(&n_rcu_torture_mberror), - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); - cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", - n_rcu_torture_boost_failure, - n_rcu_torture_boosts, - n_rcu_torture_timers); - cnt += sprintf(&page[cnt], - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); - cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - if (atomic_read(&n_rcu_torture_mberror) != 0 || - n_rcu_torture_barrier_error != 0 || - n_rcu_torture_boost_ktrerror != 0 || - n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_failure != 0 || - i > 1) { - cnt += sprintf(&page[cnt], "!!! "); - atomic_inc(&n_rcu_torture_error); - WARN_ON_ONCE(1); - } - cnt += sprintf(&page[cnt], "Reader Pipe: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Reader Batch: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); - cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); - cnt += sprintf(&page[cnt], "Free-Block Circulation: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - cnt += sprintf(&page[cnt], " %d", - atomic_read(&rcu_torture_wcount[i])); - } - cnt += sprintf(&page[cnt], "\n"); - if (cur_ops->stats) - cnt += cur_ops->stats(&page[cnt]); - return cnt; -} - -/* - * Print torture statistics. Caller must ensure that there is only - * one call to this function at a given time!!! This is normally - * accomplished by relying on the module system to only have one copy - * of the module loaded, and then by giving the rcu_torture_stats - * kthread full control (or the init/cleanup functions when rcu_torture_stats - * thread is not running). - */ -static void -rcu_torture_stats_print(void) -{ - int cnt; - - cnt = rcu_torture_printk(printk_buf); - pr_alert("%s", printk_buf); -} - -/* - * Periodically prints torture statistics, if periodic statistics printing - * was specified via the stat_interval module parameter. - * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. - */ -static int -rcu_torture_stats(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); - do { - schedule_timeout_interruptible(stat_interval * HZ); - rcu_torture_stats_print(); - rcutorture_shutdown_absorb("rcu_torture_stats"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); - return 0; -} - -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. - */ -static void rcu_torture_shuffle_tasks(void) -{ - int i; - - cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); - - set_cpus_allowed_ptr(current, shuffle_tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - shuffle_tmp_mask); - } - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - shuffle_tmp_mask); - } - if (writer_task) - set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) - set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); - if (stutter_task) - set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); - if (fqs_task) - set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); - if (shutdown_task) - set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); -#ifdef CONFIG_HOTPLUG_CPU - if (onoff_task) - set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - if (stall_task) - set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); - if (barrier_cbs_tasks) - for (i = 0; i < n_barrier_cbs; i++) - if (barrier_cbs_tasks[i]) - set_cpus_allowed_ptr(barrier_cbs_tasks[i], - shuffle_tmp_mask); - if (barrier_task) - set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. - */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - rcutorture_shutdown_absorb("rcu_torture_shuffle"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - -/* Cause the rcutorture test to "stutter", starting and stopping all - * threads periodically. - */ -static int -rcu_torture_stutter(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); - do { - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 1; - if (!kthread_should_stop()) - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 0; - rcutorture_shutdown_absorb("rcu_torture_stutter"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); - return 0; -} - -static inline void -rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) -{ - pr_alert("%s" TORTURE_FLAG - "--- %s: nreaders=%d nfakewriters=%d " - "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d " - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " - "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " - "stall_cpu=%d stall_cpu_holdoff=%d " - "n_barrier_cbs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, - test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, - stall_cpu, stall_cpu_holdoff, - n_barrier_cbs, - onoff_interval, onoff_holdoff); -} - -static struct notifier_block rcutorture_shutdown_nb = { - .notifier_call = rcutorture_shutdown_notify, -}; - -static void rcutorture_booster_cleanup(int cpu) -{ - struct task_struct *t; - - if (boost_tasks[cpu] == NULL) - return; - mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); - t = boost_tasks[cpu]; - boost_tasks[cpu] = NULL; - mutex_unlock(&boost_mutex); - - /* This must be outside of the mutex, otherwise deadlock! */ - kthread_stop(t); - boost_tasks[cpu] = NULL; -} - -static int rcutorture_booster_init(int cpu) -{ - int retval; - - if (boost_tasks[cpu] != NULL) - return 0; /* Already created, nothing more to do. */ - - /* Don't allow time recalculation while creating a new task. */ - mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); - if (IS_ERR(boost_tasks[cpu])) { - retval = PTR_ERR(boost_tasks[cpu]); - VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); - n_rcu_torture_boost_ktrerror++; - boost_tasks[cpu] = NULL; - mutex_unlock(&boost_mutex); - return retval; - } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); - mutex_unlock(&boost_mutex); - return 0; -} - -/* - * Cause the rcutorture test to shutdown the system after the test has - * run for the time specified by the shutdown_secs module parameter. - */ -static int -rcu_torture_shutdown(void *arg) -{ - long delta; - unsigned long jiffies_snap; - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && - !kthread_should_stop()) { - delta = shutdown_time - jiffies_snap; - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); - } - if (kthread_should_stop()) { - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); - return 0; - } - - /* OK, shut down the system. */ - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); - shutdown_task = NULL; /* Avoid self-kill deadlock. */ - rcu_torture_cleanup(); /* Get the success/failure message. */ - kernel_power_off(); /* Shut down the system. */ - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Execute random CPU-hotplug operations at the interval specified - * by the onoff_interval. - */ -static int -rcu_torture_onoff(void *arg) -{ - int cpu; - unsigned long delta; - int maxcpu = -1; - DEFINE_RCU_RANDOM(rand); - int ret; - unsigned long starttime; - - VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); - for_each_online_cpu(cpu) - maxcpu = cpu; - WARN_ON(maxcpu < 0); - if (onoff_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); - schedule_timeout_interruptible(onoff_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); - } - while (!kthread_should_stop()) { - cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } - schedule_timeout_interruptible(onoff_interval * HZ); - } - VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); - return 0; -} - -static int -rcu_torture_onoff_init(void) -{ - int ret; - - if (onoff_interval <= 0) - return 0; - onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return ret; - } - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ - if (onoff_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); - kthread_stop(onoff_task); - onoff_task = NULL; -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static int -rcu_torture_onoff_init(void) -{ - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - -/* - * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then - * induces a CPU stall for the time specified by stall_cpu. - */ -static int rcu_torture_stall(void *args) -{ - unsigned long stop_at; - - VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); - if (stall_cpu_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); - schedule_timeout_interruptible(stall_cpu_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); - } - if (!kthread_should_stop()) { - stop_at = get_seconds() + stall_cpu; - /* RCU CPU stall is expected behavior in following code. */ - pr_alert("rcu_torture_stall start.\n"); - rcu_read_lock(); - preempt_disable(); - while (ULONG_CMP_LT(get_seconds(), stop_at)) - continue; /* Induce RCU CPU stall warning. */ - preempt_enable(); - rcu_read_unlock(); - pr_alert("rcu_torture_stall end.\n"); - } - rcutorture_shutdown_absorb("rcu_torture_stall"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(10 * HZ); - return 0; -} - -/* Spawn CPU-stall kthread, if stall_cpu specified. */ -static int __init rcu_torture_stall_init(void) -{ - int ret; - - if (stall_cpu <= 0) - return 0; - stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); - if (IS_ERR(stall_task)) { - ret = PTR_ERR(stall_task); - stall_task = NULL; - return ret; - } - return 0; -} - -/* Clean up after the CPU-stall kthread, if one was spawned. */ -static void rcu_torture_stall_cleanup(void) -{ - if (stall_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); - kthread_stop(stall_task); - stall_task = NULL; -} - -/* Callback function for RCU barrier testing. */ -void rcu_torture_barrier_cbf(struct rcu_head *rcu) -{ - atomic_inc(&barrier_cbs_invoked); -} - -/* kthread function to register callbacks used to test RCU barriers. */ -static int rcu_torture_barrier_cbs(void *arg) -{ - long myid = (long)arg; - bool lastphase = 0; - struct rcu_head rcu; - - init_rcu_head_on_stack(&rcu); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); - set_user_nice(current, 19); - do { - wait_event(barrier_cbs_wq[myid], - barrier_phase != lastphase || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - lastphase = barrier_phase; - smp_mb(); /* ensure barrier_phase load before ->call(). */ - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) - break; - cur_ops->call(&rcu, rcu_torture_barrier_cbf); - if (atomic_dec_and_test(&barrier_cbs_count)) - wake_up(&barrier_wq); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); - cur_ops->cb_barrier(); - destroy_rcu_head_on_stack(&rcu); - return 0; -} - -/* kthread function to drive and coordinate RCU barrier testing. */ -static int rcu_torture_barrier(void *arg) -{ - int i; - - VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); - do { - atomic_set(&barrier_cbs_invoked, 0); - atomic_set(&barrier_cbs_count, n_barrier_cbs); - smp_mb(); /* Ensure barrier_phase after prior assignments. */ - barrier_phase = !barrier_phase; - for (i = 0; i < n_barrier_cbs; i++) - wake_up(&barrier_cbs_wq[i]); - wait_event(barrier_wq, - atomic_read(&barrier_cbs_count) == 0 || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) - break; - n_barrier_attempts++; - cur_ops->cb_barrier(); - if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { - n_rcu_torture_barrier_error++; - WARN_ON_ONCE(1); - } - n_barrier_successes++; - schedule_timeout_interruptible(HZ / 10); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); - return 0; -} - -/* Initialize RCU barrier testing. */ -static int rcu_torture_barrier_init(void) -{ - int i; - int ret; - - if (n_barrier_cbs == 0) - return 0; - if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { - pr_alert("%s" TORTURE_FLAG - " Call or barrier ops missing for %s,\n", - torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " RCU barrier testing omitted from run.\n", - torture_type); - return 0; - } - atomic_set(&barrier_cbs_count, 0); - atomic_set(&barrier_cbs_invoked, 0); - barrier_cbs_tasks = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), - GFP_KERNEL); - barrier_cbs_wq = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), - GFP_KERNEL); - if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) - return -ENOMEM; - for (i = 0; i < n_barrier_cbs; i++) { - init_waitqueue_head(&barrier_cbs_wq[i]); - barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, - (void *)(long)i, - "rcu_torture_barrier_cbs"); - if (IS_ERR(barrier_cbs_tasks[i])) { - ret = PTR_ERR(barrier_cbs_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); - barrier_cbs_tasks[i] = NULL; - return ret; - } - } - barrier_task = kthread_run(rcu_torture_barrier, NULL, - "rcu_torture_barrier"); - if (IS_ERR(barrier_task)) { - ret = PTR_ERR(barrier_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); - barrier_task = NULL; - } - return 0; -} - -/* Clean up after RCU barrier testing. */ -static void rcu_torture_barrier_cleanup(void) -{ - int i; - - if (barrier_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); - kthread_stop(barrier_task); - barrier_task = NULL; - } - if (barrier_cbs_tasks != NULL) { - for (i = 0; i < n_barrier_cbs; i++) { - if (barrier_cbs_tasks[i] != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); - kthread_stop(barrier_cbs_tasks[i]); - barrier_cbs_tasks[i] = NULL; - } - } - kfree(barrier_cbs_tasks); - barrier_cbs_tasks = NULL; - } - if (barrier_cbs_wq != NULL) { - kfree(barrier_cbs_wq); - barrier_cbs_wq = NULL; - } -} - -static int rcutorture_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - (void)rcutorture_booster_init(cpu); - break; - case CPU_DOWN_PREPARE: - rcutorture_booster_cleanup(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block rcutorture_cpu_nb = { - .notifier_call = rcutorture_cpu_notify, -}; - -static void -rcu_torture_cleanup(void) -{ - int i; - - mutex_lock(&fullstop_mutex); - rcutorture_record_test_transition(); - if (fullstop == FULLSTOP_SHUTDOWN) { - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - schedule_timeout_uninterruptible(10); - if (cur_ops->cb_barrier != NULL) - cur_ops->cb_barrier(); - return; - } - fullstop = FULLSTOP_RMMOD; - mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&rcutorture_shutdown_nb); - rcu_torture_barrier_cleanup(); - rcu_torture_stall_cleanup(); - if (stutter_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); - kthread_stop(stutter_task); - } - stutter_task = NULL; - if (shuffler_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - } - shuffler_task = NULL; - - if (writer_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } - kfree(reader_tasks); - reader_tasks = NULL; - } - rcu_torture_current = NULL; - - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_fakewriter task"); - kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; - } - kfree(fakewriter_tasks); - fakewriter_tasks = NULL; - } - - if (stats_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - if (fqs_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); - kthread_stop(fqs_task); - } - fqs_task = NULL; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - unregister_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) - rcutorture_booster_cleanup(i); - } - if (shutdown_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); - kthread_stop(shutdown_task); - } - shutdown_task = NULL; - rcu_torture_onoff_cleanup(); - - /* Wait for all RCU callbacks to fire. */ - - if (cur_ops->cb_barrier != NULL) - cur_ops->cb_barrier(); - - rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - - if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) - rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); - else if (n_online_successes != n_online_attempts || - n_offline_successes != n_offline_attempts) - rcu_torture_print_module_parms(cur_ops, - "End of test: RCU_HOTPLUG"); - else - rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); -} - -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD -static void rcu_torture_leak_cb(struct rcu_head *rhp) -{ -} - -static void rcu_torture_err_cb(struct rcu_head *rhp) -{ - /* - * This -might- happen due to race conditions, but is unlikely. - * The scenario that leads to this happening is that the - * first of the pair of duplicate callbacks is queued, - * someone else starts a grace period that includes that - * callback, then the second of the pair must wait for the - * next grace period. Unlikely, but can happen. If it - * does happen, the debug-objects subsystem won't have splatted. - */ - pr_alert("rcutorture: duplicated callback was invoked.\n"); -} -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - -/* - * Verify that double-free causes debug-objects to complain, but only - * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test - * cannot be carried out. - */ -static void rcu_test_debug_objects(void) -{ -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - struct rcu_head rh1; - struct rcu_head rh2; - - init_rcu_head_on_stack(&rh1); - init_rcu_head_on_stack(&rh2); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); - - /* Try to queue the rh2 pair of callbacks for the same grace period. */ - preempt_disable(); /* Prevent preemption from interrupting test. */ - rcu_read_lock(); /* Make it impossible to finish a grace period. */ - call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ - local_irq_disable(); /* Make it harder to start a new grace period. */ - call_rcu(&rh2, rcu_torture_leak_cb); - call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ - local_irq_enable(); - rcu_read_unlock(); - preempt_enable(); - - /* Wait for them all to get done so we can safely return. */ - rcu_barrier(); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); - destroy_rcu_head_on_stack(&rh1); - destroy_rcu_head_on_stack(&rh2); -#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); -#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -} - -static int __init -rcu_torture_init(void) -{ - int i; - int cpu; - int firsterr = 0; - int retval; - static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, - }; - - mutex_lock(&fullstop_mutex); - - /* Process args and tell the world that the torturer is on the job. */ - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { - cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(torture_ops)) { - pr_alert("rcu-torture: invalid torture type: \"%s\"\n", - torture_type); - pr_alert("rcu-torture types:"); - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - pr_alert(" %s", torture_ops[i]->name); - pr_alert("\n"); - mutex_unlock(&fullstop_mutex); - return -EINVAL; - } - if (cur_ops->fqs == NULL && fqs_duration != 0) { - pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); - fqs_duration = 0; - } - if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ - - if (nreaders >= 0) - nrealreaders = nreaders; - else - nrealreaders = 2 * num_online_cpus(); - rcu_torture_print_module_parms(cur_ops, "Start of test"); - fullstop = FULLSTOP_DONTSTOP; - - /* Set up the freelist. */ - - INIT_LIST_HEAD(&rcu_torture_freelist); - for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { - rcu_tortures[i].rtort_mbtest = 0; - list_add_tail(&rcu_tortures[i].rtort_free, - &rcu_torture_freelist); - } - - /* Initialize the statistics so that each run gets its own numbers. */ - - rcu_torture_current = NULL; - rcu_torture_current_version = 0; - atomic_set(&n_rcu_torture_alloc, 0); - atomic_set(&n_rcu_torture_alloc_fail, 0); - atomic_set(&n_rcu_torture_free, 0); - atomic_set(&n_rcu_torture_mberror, 0); - atomic_set(&n_rcu_torture_error, 0); - n_rcu_torture_barrier_error = 0; - n_rcu_torture_boost_ktrerror = 0; - n_rcu_torture_boost_rterror = 0; - n_rcu_torture_boost_failure = 0; - n_rcu_torture_boosts = 0; - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - atomic_set(&rcu_torture_wcount[i], 0); - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - per_cpu(rcu_torture_count, cpu)[i] = 0; - per_cpu(rcu_torture_batch, cpu)[i] = 0; - } - } - - /* Start up the kthreads. */ - - VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_create(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); - writer_task = NULL; - goto unwind; - } - wake_up_process(writer_task); - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nfakewriters; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; - goto unwind; - } - } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), - GFP_KERNEL); - if (reader_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealreaders; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; - goto unwind; - } - } - if (stat_interval > 0) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); - if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); - stats_task = NULL; - goto unwind; - } - } - if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - - if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - firsterr = -ENOMEM; - VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); - goto unwind; - } - - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - free_cpumask_var(shuffle_tmp_mask); - firsterr = PTR_ERR(shuffler_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - goto unwind; - } - } - if (stutter < 0) - stutter = 0; - if (stutter) { - /* Create the stutter thread */ - stutter_task = kthread_run(rcu_torture_stutter, NULL, - "rcu_torture_stutter"); - if (IS_ERR(stutter_task)) { - firsterr = PTR_ERR(stutter_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; - goto unwind; - } - } - if (fqs_duration < 0) - fqs_duration = 0; - if (fqs_duration) { - /* Create the stutter thread */ - fqs_task = kthread_run(rcu_torture_fqs, NULL, - "rcu_torture_fqs"); - if (IS_ERR(fqs_task)) { - firsterr = PTR_ERR(fqs_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); - fqs_task = NULL; - goto unwind; - } - } - if (test_boost_interval < 1) - test_boost_interval = 1; - if (test_boost_duration < 2) - test_boost_duration = 2; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - - boost_starttime = jiffies + test_boost_interval * HZ; - register_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) { - if (cpu_is_offline(i)) - continue; /* Heuristic: CPU can go offline. */ - retval = rcutorture_booster_init(i); - if (retval < 0) { - firsterr = retval; - goto unwind; - } - } - } - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); - if (IS_ERR(shutdown_task)) { - firsterr = PTR_ERR(shutdown_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - goto unwind; - } - wake_up_process(shutdown_task); - } - i = rcu_torture_onoff_init(); - if (i != 0) { - firsterr = i; - goto unwind; - } - register_reboot_notifier(&rcutorture_shutdown_nb); - i = rcu_torture_stall_init(); - if (i != 0) { - firsterr = i; - goto unwind; - } - retval = rcu_torture_barrier_init(); - if (retval != 0) { - firsterr = retval; - goto unwind; - } - if (object_debug) - rcu_test_debug_objects(); - rcutorture_record_test_transition(); - mutex_unlock(&fullstop_mutex); - return 0; - -unwind: - mutex_unlock(&fullstop_mutex); - rcu_torture_cleanup(); - return firsterr; -} - -module_init(rcu_torture_init); -module_exit(rcu_torture_cleanup); diff --git a/kernel/rcutree.c b/kernel/rcutree.c deleted file mode 100644 index 240604aa3f70..000000000000 --- a/kernel/rcutree.c +++ /dev/null @@ -1,3396 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2008 - * - * Authors: Dipankar Sarma - * Manfred Spraul - * Paul E. McKenney Hierarchical version - * - * Based on the original work by Paul McKenney - * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rcutree.h" -#include - -#include "rcu.h" - -/* Data structures. */ - -static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - -/* - * In order to export the rcu_state name to the tracing tools, it - * needs to be added in the __tracepoint_string section. - * This requires defining a separate variable tp__varname - * that points to the string being used, and this will allow - * the tracing userspace tools to be able to decipher the string - * address to the matching string. - */ -#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ -static char sname##_varname[] = #sname; \ -static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ -struct rcu_state sname##_state = { \ - .level = { &sname##_state.node[0] }, \ - .call = cr, \ - .fqs_state = RCU_GP_IDLE, \ - .gpnum = 0UL - 300UL, \ - .completed = 0UL - 300UL, \ - .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ - .orphan_nxttail = &sname##_state.orphan_nxtlist, \ - .orphan_donetail = &sname##_state.orphan_donelist, \ - .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ - .name = sname##_varname, \ - .abbr = sabbr, \ -}; \ -DEFINE_PER_CPU(struct rcu_data, sname##_data) - -RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); -RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); - -static struct rcu_state *rcu_state; -LIST_HEAD(rcu_struct_flavors); - -/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ -static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; -module_param(rcu_fanout_leaf, int, 0444); -int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; -static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ - NUM_RCU_LVL_0, - NUM_RCU_LVL_1, - NUM_RCU_LVL_2, - NUM_RCU_LVL_3, - NUM_RCU_LVL_4, -}; -int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ - -/* - * The rcu_scheduler_active variable transitions from zero to one just - * before the first task is spawned. So when this variable is zero, RCU - * can assume that there is but one task, allowing RCU to (for example) - * optimize synchronize_sched() to a simple barrier(). When this variable - * is one, RCU must actually do all the hard work required to detect real - * grace periods. This variable is also used to suppress boot-time false - * positives from lockdep-RCU error checking. - */ -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); - -/* - * The rcu_scheduler_fully_active variable transitions from zero to one - * during the early_initcall() processing, which is after the scheduler - * is capable of creating new tasks. So RCU processing (for example, - * creating tasks for RCU priority boosting) must be delayed until after - * rcu_scheduler_fully_active transitions from zero to one. We also - * currently delay invocation of any RCU callbacks until after this point. - * - * It might later prove better for people registering RCU callbacks during - * early boot to take responsibility for these callbacks, but one step at - * a time. - */ -static int rcu_scheduler_fully_active __read_mostly; - -#ifdef CONFIG_RCU_BOOST - -/* - * Control variables for per-CPU and per-rcu_node kthreads. These - * handle all flavors of RCU. - */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); -static void invoke_rcu_core(void); -static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); - -/* - * Track the rcutorture test sequence number and the update version - * number within a given test. The rcutorture_testseq is incremented - * on every rcutorture module load and unload, so has an odd value - * when a test is running. The rcutorture_vernum is set to zero - * when rcutorture starts and is incremented on each rcutorture update. - * These variables enable correlating rcutorture output with the - * RCU tracing information. - */ -unsigned long rcutorture_testseq; -unsigned long rcutorture_vernum; - -/* - * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s - * permit this function to be invoked without holding the root rcu_node - * structure's ->lock, but of course results can be subject to change. - */ -static int rcu_gp_in_progress(struct rcu_state *rsp) -{ - return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); -} - -/* - * Note a quiescent state. Because we do not need to know - * how many quiescent states passed, just if there was at least - * one since the start of the grace period, this just sets a flag. - * The caller must have disabled preemption. - */ -void rcu_sched_qs(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; -} - -void rcu_bh_qs(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; -} - -/* - * Note a context switch. This is a quiescent state for RCU-sched, - * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. - */ -void rcu_note_context_switch(int cpu) -{ - trace_rcu_utilization(TPS("Start context switch")); - rcu_sched_qs(cpu); - rcu_preempt_note_context_switch(cpu); - trace_rcu_utilization(TPS("End context switch")); -} -EXPORT_SYMBOL_GPL(rcu_note_context_switch); - -static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, - .dynticks = ATOMIC_INIT(1), -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, - .dynticks_idle = ATOMIC_INIT(1), -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -}; - -static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ -static long qhimark = 10000; /* If this many pending, ignore blimit. */ -static long qlowmark = 100; /* Once only this many pending, use blimit. */ - -module_param(blimit, long, 0444); -module_param(qhimark, long, 0444); -module_param(qlowmark, long, 0444); - -static ulong jiffies_till_first_fqs = ULONG_MAX; -static ulong jiffies_till_next_fqs = ULONG_MAX; - -module_param(jiffies_till_first_fqs, ulong, 0644); -module_param(jiffies_till_next_fqs, ulong, 0644); - -static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp); -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj); -static void force_quiescent_state(struct rcu_state *rsp); -static int rcu_pending(int cpu); - -/* - * Return the number of RCU-sched batches processed thus far for debug & stats. - */ -long rcu_batches_completed_sched(void) -{ - return rcu_sched_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); - -/* - * Return the number of RCU BH batches processed thus far for debug & stats. - */ -long rcu_batches_completed_bh(void) -{ - return rcu_bh_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); - -/* - * Force a quiescent state for RCU BH. - */ -void rcu_bh_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_bh_state); -} -EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); - -/* - * Record the number of times rcutorture tests have been initiated and - * terminated. This information allows the debugfs tracing stats to be - * correlated to the rcutorture messages, even when the rcutorture module - * is being repeatedly loaded and unloaded. In other words, we cannot - * store this state in rcutorture itself. - */ -void rcutorture_record_test_transition(void) -{ - rcutorture_testseq++; - rcutorture_vernum = 0; -} -EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); - -/* - * Record the number of writer passes through the current rcutorture test. - * This is also used to correlate debugfs tracing stats with the rcutorture - * messages. - */ -void rcutorture_record_progress(unsigned long vernum) -{ - rcutorture_vernum++; -} -EXPORT_SYMBOL_GPL(rcutorture_record_progress); - -/* - * Force a quiescent state for RCU-sched. - */ -void rcu_sched_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_sched_state); -} -EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); - -/* - * Does the CPU have callbacks ready to be invoked? - */ -static int -cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) -{ - return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && - rdp->nxttail[RCU_DONE_TAIL] != NULL; -} - -/* - * Does the current CPU require a not-yet-started grace period? - * The caller must have disabled interrupts to prevent races with - * normal callback registry. - */ -static int -cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) -{ - int i; - - if (rcu_gp_in_progress(rsp)) - return 0; /* No, a grace period is already in progress. */ - if (rcu_nocb_needs_gp(rsp)) - return 1; /* Yes, a no-CBs CPU needs one. */ - if (!rdp->nxttail[RCU_NEXT_TAIL]) - return 0; /* No, this is a no-CBs (or offline) CPU. */ - if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) - return 1; /* Yes, this CPU has newly registered callbacks. */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rdp->nxttail[i - 1] != rdp->nxttail[i] && - ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), - rdp->nxtcompleted[i])) - return 1; /* Yes, CBs for future grace period. */ - return 0; /* No grace period needed. */ -} - -/* - * Return the root node of the specified rcu_state structure. - */ -static struct rcu_node *rcu_get_root(struct rcu_state *rsp) -{ - return &rsp->node[0]; -} - -/* - * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state - * - * If the new value of the ->dynticks_nesting counter now is zero, - * we really have entered idle, and must do the appropriate accounting. - * The caller must have disabled interrupts. - */ -static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, - bool user) -{ - trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); - if (!user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); - ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } - rcu_prepare_for_idle(smp_processor_id()); - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); - - /* - * It is illegal to enter an extended quiescent state while - * in an RCU read-side critical section. - */ - rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), - "Illegal idle entry in RCU read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), - "Illegal idle entry in RCU-bh read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), - "Illegal idle entry in RCU-sched read-side critical section."); -} - -/* - * Enter an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - */ -static void rcu_eqs_enter(bool user) -{ - long long oldval; - struct rcu_dynticks *rdtp; - - rdtp = this_cpu_ptr(&rcu_dynticks); - oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); - if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) - rdtp->dynticks_nesting = 0; - else - rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_eqs_enter_common(rdtp, oldval, user); -} - -/** - * rcu_idle_enter - inform RCU that current CPU is entering idle - * - * Enter idle mode, in other words, -leave- the mode in which RCU - * read-side critical sections can occur. (Though RCU read-side - * critical sections can occur in irq handlers in idle, a possibility - * handled by irq_enter() and irq_exit().) - * - * We crowbar the ->dynticks_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. - */ -void rcu_idle_enter(void) -{ - unsigned long flags; - - local_irq_save(flags); - rcu_eqs_enter(false); - rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(rcu_idle_enter); - -#ifdef CONFIG_RCU_USER_QS -/** - * rcu_user_enter - inform RCU that we are resuming userspace. - * - * Enter RCU idle mode right before resuming userspace. No use of RCU - * is permitted between this call and rcu_user_exit(). This way the - * CPU doesn't need to maintain the tick for RCU maintenance purposes - * when the CPU runs in userspace. - */ -void rcu_user_enter(void) -{ - rcu_eqs_enter(1); -} -#endif /* CONFIG_RCU_USER_QS */ - -/** - * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle - * - * Exit from an interrupt handler, which might possibly result in entering - * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. - * - * This code assumes that the idle loop never does anything that might - * result in unbalanced calls to irq_enter() and irq_exit(). If your - * architecture violates this assumption, RCU will give you what you - * deserve, good and hard. But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - */ -void rcu_irq_exit(void) -{ - unsigned long flags; - long long oldval; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = this_cpu_ptr(&rcu_dynticks); - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting--; - WARN_ON_ONCE(rdtp->dynticks_nesting < 0); - if (rdtp->dynticks_nesting) - trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); - else - rcu_eqs_enter_common(rdtp, oldval, true); - rcu_sysidle_enter(rdtp, 1); - local_irq_restore(flags); -} - -/* - * rcu_eqs_exit_common - current CPU moving away from extended quiescent state - * - * If the new value of the ->dynticks_nesting counter was previously zero, - * we really have exited idle, and must do the appropriate accounting. - * The caller must have disabled interrupts. - */ -static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, - int user) -{ - smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); - rcu_cleanup_after_idle(smp_processor_id()); - trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); - if (!user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on exit: not idle task"), - oldval, rdtp->dynticks_nesting); - ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - -/* - * Exit an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - */ -static void rcu_eqs_exit(bool user) -{ - struct rcu_dynticks *rdtp; - long long oldval; - - rdtp = this_cpu_ptr(&rcu_dynticks); - oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE(oldval < 0); - if (oldval & DYNTICK_TASK_NEST_MASK) - rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else - rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(rdtp, oldval, user); -} - -/** - * rcu_idle_exit - inform RCU that current CPU is leaving idle - * - * Exit idle mode, in other words, -enter- the mode in which RCU - * read-side critical sections can occur. - * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to - * allow for the possibility of usermode upcalls messing up our count - * of interrupt nesting level during the busy period that is just - * now starting. - */ -void rcu_idle_exit(void) -{ - unsigned long flags; - - local_irq_save(flags); - rcu_eqs_exit(false); - rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(rcu_idle_exit); - -#ifdef CONFIG_RCU_USER_QS -/** - * rcu_user_exit - inform RCU that we are exiting userspace. - * - * Exit RCU idle mode while entering the kernel because it can - * run a RCU read side critical section anytime. - */ -void rcu_user_exit(void) -{ - rcu_eqs_exit(1); -} -#endif /* CONFIG_RCU_USER_QS */ - -/** - * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle - * - * Enter an interrupt handler, which might possibly result in exiting - * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. - * - * Note that the Linux kernel is fully capable of entering an interrupt - * handler that it never exits, for example when doing upcalls to - * user mode! This code assumes that the idle loop never does upcalls to - * user mode. If your architecture does do upcalls from the idle loop (or - * does anything else that results in unbalanced calls to the irq_enter() - * and irq_exit() functions), RCU will give you what you deserve, good - * and hard. But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - */ -void rcu_irq_enter(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - long long oldval; - - local_irq_save(flags); - rdtp = this_cpu_ptr(&rcu_dynticks); - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting++; - WARN_ON_ONCE(rdtp->dynticks_nesting == 0); - if (oldval) - trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); - else - rcu_eqs_exit_common(rdtp, oldval, true); - rcu_sysidle_exit(rdtp, 1); - local_irq_restore(flags); -} - -/** - * rcu_nmi_enter - inform RCU of entry to NMI context - * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is active. - */ -void rcu_nmi_enter(void) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - if (rdtp->dynticks_nmi_nesting == 0 && - (atomic_read(&rdtp->dynticks) & 0x1)) - return; - rdtp->dynticks_nmi_nesting++; - smp_mb__before_atomic_inc(); /* Force delay from prior write. */ - atomic_inc(&rdtp->dynticks); - /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ - smp_mb__after_atomic_inc(); /* See above. */ - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); -} - -/** - * rcu_nmi_exit - inform RCU of exit from NMI context - * - * If the CPU was idle with dynamic ticks active, and there is no - * irq handler running, this updates rdtp->dynticks_nmi to let the - * RCU grace-period handling know that the CPU is no longer active. - */ -void rcu_nmi_exit(void) -{ - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - if (rdtp->dynticks_nmi_nesting == 0 || - --rdtp->dynticks_nmi_nesting != 0) - return; - /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ - smp_mb__before_atomic_inc(); /* See above. */ - atomic_inc(&rdtp->dynticks); - smp_mb__after_atomic_inc(); /* Force delay to next write. */ - WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); -} - -/** - * __rcu_is_watching - are RCU read-side critical sections safe? - * - * Return true if RCU is watching the running CPU, which means that - * this CPU can safely enter RCU read-side critical sections. Unlike - * rcu_is_watching(), the caller of __rcu_is_watching() must have at - * least disabled preemption. - */ -bool __rcu_is_watching(void) -{ - return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; -} - -/** - * rcu_is_watching - see if RCU thinks that the current CPU is idle - * - * If the current CPU is in its idle loop and is neither in an interrupt - * or NMI handler, return true. - */ -bool rcu_is_watching(void) -{ - int ret; - - preempt_disable(); - ret = __rcu_is_watching(); - preempt_enable(); - return ret; -} -EXPORT_SYMBOL_GPL(rcu_is_watching); - -#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) - -/* - * Is the current CPU online? Disable preemption to avoid false positives - * that could otherwise happen due to the current CPU number being sampled, - * this task being preempted, its old CPU being taken offline, resuming - * on some other CPU, then determining that its old CPU is now offline. - * It is OK to use RCU on an offline processor during initial boot, hence - * the check for rcu_scheduler_fully_active. Note also that it is OK - * for a CPU coming online to use RCU for one jiffy prior to marking itself - * online in the cpu_online_mask. Similarly, it is OK for a CPU going - * offline to continue to use RCU for one jiffy after marking itself - * offline in the cpu_online_mask. This leniency is necessary given the - * non-atomic nature of the online and offline processing, for example, - * the fact that a CPU enters the scheduler after completing the CPU_DYING - * notifiers. - * - * This is also why RCU internally marks CPUs online during the - * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. - * - * Disable checking if in an NMI handler because we cannot safely report - * errors from NMI handlers anyway. - */ -bool rcu_lockdep_current_cpu_online(void) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - bool ret; - - if (in_nmi()) - return 1; - preempt_disable(); - rdp = this_cpu_ptr(&rcu_sched_data); - rnp = rdp->mynode; - ret = (rdp->grpmask & rnp->qsmaskinit) || - !rcu_scheduler_fully_active; - preempt_enable(); - return ret; -} -EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); - -#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ - -/** - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle - * - * If the current CPU is idle or running at a first-level (not nested) - * interrupt from idle, return true. The caller must have at least - * disabled preemption. - */ -static int rcu_is_cpu_rrupt_from_idle(void) -{ - return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; -} - -/* - * Snapshot the specified CPU's dynticks counter so that we can later - * credit them with an implicit quiescent state. Return 1 if this CPU - * is in dynticks idle mode, which is an extended quiescent state. - */ -static int dyntick_save_progress_counter(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) -{ - rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); - rcu_sysidle_check_cpu(rdp, isidle, maxj); - return (rdp->dynticks_snap & 0x1) == 0; -} - -/* - * Return true if the specified CPU has passed through a quiescent - * state by virtue of being in or having passed through an dynticks - * idle state since the last call to dyntick_save_progress_counter() - * for this same CPU, or by virtue of having been offline. - */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, - bool *isidle, unsigned long *maxj) -{ - unsigned int curr; - unsigned int snap; - - curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); - snap = (unsigned int)rdp->dynticks_snap; - - /* - * If the CPU passed through or entered a dynticks idle phase with - * no active irq/NMI handlers, then we can safely pretend that the CPU - * already acknowledged the request to pass through a quiescent - * state. Either way, that CPU cannot possibly be in an RCU - * read-side critical section that started before the beginning - * of the current RCU grace period. - */ - if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - rdp->dynticks_fqs++; - return 1; - } - - /* - * Check for the CPU being offline, but only if the grace period - * is old enough. We don't need to worry about the CPU changing - * state: If we see it offline even once, it has been through a - * quiescent state. - * - * The reason for insisting that the grace period be at least - * one jiffy old is that CPUs that are not quite online and that - * have just gone offline can still execute RCU read-side critical - * sections. - */ - if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) - return 0; /* Grace period is not old enough. */ - barrier(); - if (cpu_is_offline(rdp->cpu)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); - rdp->offline_fqs++; - return 1; - } - - /* - * There is a possibility that a CPU in adaptive-ticks state - * might run in the kernel with the scheduling-clock tick disabled - * for an extended time period. Invoke rcu_kick_nohz_cpu() to - * force the CPU to restart the scheduling-clock tick in this - * CPU is in this state. - */ - rcu_kick_nohz_cpu(rdp->cpu); - - return 0; -} - -static void record_gp_stall_check_time(struct rcu_state *rsp) -{ - unsigned long j = ACCESS_ONCE(jiffies); - - rsp->gp_start = j; - smp_wmb(); /* Record start time before stall time. */ - rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); -} - -/* - * Dump stacks of all tasks running on stalled CPUs. This is a fallback - * for architectures that do not implement trigger_all_cpu_backtrace(). - * The NMI-triggered stack traces are more accurate because they are - * printed by the target CPU. - */ -static void rcu_dump_cpu_stacks(struct rcu_state *rsp) -{ - int cpu; - unsigned long flags; - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) - dump_cpu_task(rnp->grplo + cpu); - } - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } -} - -static void print_other_cpu_stall(struct rcu_state *rsp) -{ - int cpu; - long delta; - unsigned long flags; - int ndetected = 0; - struct rcu_node *rnp = rcu_get_root(rsp); - long totqlen = 0; - - /* Only let one CPU complain about others per time interval. */ - - raw_spin_lock_irqsave(&rnp->lock, flags); - delta = jiffies - rsp->jiffies_stall; - if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - - /* - * OK, time to rat on our buddy... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. - */ - pr_err("INFO: %s detected stalls on CPUs/tasks:", - rsp->name); - print_cpu_stall_info_begin(); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected += rcu_print_task_stall(rnp); - if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, - rnp->grplo + cpu); - ndetected++; - } - } - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } - - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining. - */ - rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); - ndetected += rcu_print_task_stall(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; - pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rsp->gp_start), - rsp->gpnum, rsp->completed, totqlen); - if (ndetected == 0) - pr_err("INFO: Stall ended before state dump start\n"); - else if (!trigger_all_cpu_backtrace()) - rcu_dump_cpu_stacks(rsp); - - /* Complain about tasks blocking the grace period. */ - - rcu_print_detail_task_stall(rsp); - - force_quiescent_state(rsp); /* Kick them all. */ -} - -static void print_cpu_stall(struct rcu_state *rsp) -{ - int cpu; - unsigned long flags; - struct rcu_node *rnp = rcu_get_root(rsp); - long totqlen = 0; - - /* - * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. - */ - pr_err("INFO: %s self-detected stall on CPU", rsp->name); - print_cpu_stall_info_begin(); - print_cpu_stall_info(rsp, smp_processor_id()); - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; - pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", - jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); - if (!trigger_all_cpu_backtrace()) - dump_stack(); - - raw_spin_lock_irqsave(&rnp->lock, flags); - if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) - rsp->jiffies_stall = jiffies + - 3 * rcu_jiffies_till_stall_check() + 3; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - - set_need_resched(); /* kick ourselves to get things going. */ -} - -static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long completed; - unsigned long gpnum; - unsigned long gps; - unsigned long j; - unsigned long js; - struct rcu_node *rnp; - - if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) - return; - j = ACCESS_ONCE(jiffies); - - /* - * Lots of memory barriers to reject false positives. - * - * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, - * then rsp->gp_start, and finally rsp->completed. These values - * are updated in the opposite order with memory barriers (or - * equivalent) during grace-period initialization and cleanup. - * Now, a false positive can occur if we get an new value of - * rsp->gp_start and a old value of rsp->jiffies_stall. But given - * the memory barriers, the only way that this can happen is if one - * grace period ends and another starts between these two fetches. - * Detect this by comparing rsp->completed with the previous fetch - * from rsp->gpnum. - * - * Given this check, comparisons of jiffies, rsp->jiffies_stall, - * and rsp->gp_start suffice to forestall false positives. - */ - gpnum = ACCESS_ONCE(rsp->gpnum); - smp_rmb(); /* Pick up ->gpnum first... */ - js = ACCESS_ONCE(rsp->jiffies_stall); - smp_rmb(); /* ...then ->jiffies_stall before the rest... */ - gps = ACCESS_ONCE(rsp->gp_start); - smp_rmb(); /* ...and finally ->gp_start before ->completed. */ - completed = ACCESS_ONCE(rsp->completed); - if (ULONG_CMP_GE(completed, gpnum) || - ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) - return; /* No stall or GP completed since entering function. */ - rnp = rdp->mynode; - if (rcu_gp_in_progress(rsp) && - (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { - - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(rsp); - - } else if (rcu_gp_in_progress(rsp) && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { - - /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(rsp); - } -} - -/** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. - * - * The caller must disable hard irqs. - */ -void rcu_cpu_stall_reset(void) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - rsp->jiffies_stall = jiffies + ULONG_MAX / 2; -} - -/* - * Initialize the specified rcu_data structure's callback list to empty. - */ -static void init_callback_list(struct rcu_data *rdp) -{ - int i; - - if (init_nocb_callback_list(rdp)) - return; - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; -} - -/* - * Determine the value that ->completed will have at the end of the - * next subsequent grace period. This is used to tag callbacks so that - * a CPU can invoke callbacks in a timely fashion even if that CPU has - * been dyntick-idle for an extended period with callbacks under the - * influence of RCU_FAST_NO_HZ. - * - * The caller must hold rnp->lock with interrupts disabled. - */ -static unsigned long rcu_cbs_completed(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - /* - * If RCU is idle, we just wait for the next grace period. - * But we can only be sure that RCU is idle if we are looking - * at the root rcu_node structure -- otherwise, a new grace - * period might have started, but just not yet gotten around - * to initializing the current non-root rcu_node structure. - */ - if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) - return rnp->completed + 1; - - /* - * Otherwise, wait for a possible partial grace period and - * then the subsequent full grace period. - */ - return rnp->completed + 2; -} - -/* - * Trace-event helper function for rcu_start_future_gp() and - * rcu_nocb_wait_gp(). - */ -static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, const char *s) -{ - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, - rnp->completed, c, rnp->level, - rnp->grplo, rnp->grphi, s); -} - -/* - * Start some future grace period, as needed to handle newly arrived - * callbacks. The required future grace periods are recorded in each - * rcu_node structure's ->need_future_gp field. - * - * The caller must hold the specified rcu_node structure's ->lock. - */ -static unsigned long __maybe_unused -rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) -{ - unsigned long c; - int i; - struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); - - /* - * Pick up grace-period number for new callbacks. If this - * grace period is already marked as needed, return to the caller. - */ - c = rcu_cbs_completed(rdp->rsp, rnp); - trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); - if (rnp->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); - return c; - } - - /* - * If either this rcu_node structure or the root rcu_node structure - * believe that a grace period is in progress, then we must wait - * for the one following, which is in "c". Because our request - * will be noticed at the end of the current grace period, we don't - * need to explicitly start one. - */ - if (rnp->gpnum != rnp->completed || - ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { - rnp->need_future_gp[c & 0x1]++; - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); - return c; - } - - /* - * There might be no grace period in progress. If we don't already - * hold it, acquire the root rcu_node structure's lock in order to - * start one (if needed). - */ - if (rnp != rnp_root) - raw_spin_lock(&rnp_root->lock); - - /* - * Get a new grace-period number. If there really is no grace - * period in progress, it will be smaller than the one we obtained - * earlier. Adjust callbacks as needed. Note that even no-CBs - * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. - */ - c = rcu_cbs_completed(rdp->rsp, rnp_root); - for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) - if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) - rdp->nxtcompleted[i] = c; - - /* - * If the needed for the required grace period is already - * recorded, trace and leave. - */ - if (rnp_root->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); - goto unlock_out; - } - - /* Record the need for the future grace period. */ - rnp_root->need_future_gp[c & 0x1]++; - - /* If a grace period is not already in progress, start one. */ - if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); - } else { - trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); - rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); - } -unlock_out: - if (rnp != rnp_root) - raw_spin_unlock(&rnp_root->lock); - return c; -} - -/* - * Clean up any old requests for the just-ended grace period. Also return - * whether any additional grace periods have been requested. Also invoke - * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads - * waiting for this grace period to complete. - */ -static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) -{ - int c = rnp->completed; - int needmore; - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - - rcu_nocb_gp_cleanup(rsp, rnp); - rnp->need_future_gp[c & 0x1] = 0; - needmore = rnp->need_future_gp[(c + 1) & 0x1]; - trace_rcu_future_gp(rnp, rdp, c, - needmore ? TPS("CleanupMore") : TPS("Cleanup")); - return needmore; -} - -/* - * If there is room, assign a ->completed number to any callbacks on - * this CPU that have not already been assigned. Also accelerate any - * callbacks that were previously assigned a ->completed number that has - * since proven to be too conservative, which can happen if callbacks get - * assigned a ->completed number while RCU is idle, but with reference to - * a non-root rcu_node structure. This function is idempotent, so it does - * not hurt to call it repeatedly. - * - * The caller must hold rnp->lock with interrupts disabled. - */ -static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) -{ - unsigned long c; - int i; - - /* If the CPU has no callbacks, nothing to do. */ - if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return; - - /* - * Starting from the sublist containing the callbacks most - * recently assigned a ->completed number and working down, find the - * first sublist that is not assignable to an upcoming grace period. - * Such a sublist has something in it (first two tests) and has - * a ->completed number assigned that will complete sooner than - * the ->completed number for newly arrived callbacks (last test). - * - * The key point is that any later sublist can be assigned the - * same ->completed number as the newly arrived callbacks, which - * means that the callbacks in any of these later sublist can be - * grouped into a single sublist, whether or not they have already - * been assigned a ->completed number. - */ - c = rcu_cbs_completed(rsp, rnp); - for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) - if (rdp->nxttail[i] != rdp->nxttail[i - 1] && - !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) - break; - - /* - * If there are no sublist for unassigned callbacks, leave. - * At the same time, advance "i" one sublist, so that "i" will - * index into the sublist where all the remaining callbacks should - * be grouped into. - */ - if (++i >= RCU_NEXT_TAIL) - return; - - /* - * Assign all subsequent callbacks' ->completed number to the next - * full grace period and group them all in the sublist initially - * indexed by "i". - */ - for (; i <= RCU_NEXT_TAIL; i++) { - rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxtcompleted[i] = c; - } - /* Record any needed additional grace periods. */ - rcu_start_future_gp(rnp, rdp); - - /* Trace depending on how much we were able to accelerate. */ - if (!*rdp->nxttail[RCU_WAIT_TAIL]) - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); - else - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); -} - -/* - * Move any callbacks whose grace period has completed to the - * RCU_DONE_TAIL sublist, then compact the remaining sublists and - * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL - * sublist. This function is idempotent, so it does not hurt to - * invoke it repeatedly. As long as it is not invoked -too- often... - * - * The caller must hold rnp->lock with interrupts disabled. - */ -static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) -{ - int i, j; - - /* If the CPU has no callbacks, nothing to do. */ - if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return; - - /* - * Find all callbacks whose ->completed numbers indicate that they - * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. - */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { - if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) - break; - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; - } - /* Clean up any sublist tail pointers that were misordered above. */ - for (j = RCU_WAIT_TAIL; j < i; j++) - rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; - - /* Copy down callbacks to fill in empty sublists. */ - for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { - if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) - break; - rdp->nxttail[j] = rdp->nxttail[i]; - rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; - } - - /* Classify any remaining callbacks. */ - rcu_accelerate_cbs(rsp, rnp, rdp); -} - -/* - * Update CPU-local rcu_data state to record the beginnings and ends of - * grace periods. The caller must hold the ->lock of the leaf rcu_node - * structure corresponding to the current CPU, and must have irqs disabled. - */ -static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* Handle the ends of any preceding grace periods first. */ - if (rdp->completed == rnp->completed) { - - /* No grace period end, so just accelerate recent callbacks. */ - rcu_accelerate_cbs(rsp, rnp, rdp); - - } else { - - /* Advance callbacks. */ - rcu_advance_cbs(rsp, rnp, rdp); - - /* Remember that we saw this grace-period completion. */ - rdp->completed = rnp->completed; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); - } - - if (rdp->gpnum != rnp->gpnum) { - /* - * If the current grace period is waiting for this CPU, - * set up to detect a quiescent state, otherwise don't - * go looking for one. - */ - rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->passed_quiesce = 0; - rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); - zero_cpu_stall_ticks(rdp); - } -} - -static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_node *rnp; - - local_irq_save(flags); - rnp = rdp->mynode; - if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && - rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ - local_irq_restore(flags); - return; - } - __note_gp_changes(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Initialize a new grace period. Return 0 if no grace period required. - */ -static int rcu_gp_init(struct rcu_state *rsp) -{ - struct rcu_data *rdp; - struct rcu_node *rnp = rcu_get_root(rsp); - - rcu_bind_gp_kthread(); - raw_spin_lock_irq(&rnp->lock); - if (rsp->gp_flags == 0) { - /* Spurious wakeup, tell caller to go back to sleep. */ - raw_spin_unlock_irq(&rnp->lock); - return 0; - } - rsp->gp_flags = 0; /* Clear all flags: New grace period. */ - - if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { - /* - * Grace period already in progress, don't start another. - * Not supposed to be able to happen. - */ - raw_spin_unlock_irq(&rnp->lock); - return 0; - } - - /* Advance to a new grace period and initialize state. */ - record_gp_stall_check_time(rsp); - smp_wmb(); /* Record GP times before starting GP. */ - rsp->gpnum++; - trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); - raw_spin_unlock_irq(&rnp->lock); - - /* Exclude any concurrent CPU-hotplug operations. */ - mutex_lock(&rsp->onoff_mutex); - - /* - * Set the quiescent-state-needed bits in all the rcu_node - * structures for all currently online CPUs in breadth-first order, - * starting from the root rcu_node structure, relying on the layout - * of the tree within the rsp->node[] array. Note that other CPUs - * will access only the leaves of the hierarchy, thus seeing that no - * grace period is in progress, at least until the corresponding - * leaf node has been initialized. In addition, we have excluded - * CPU-hotplug operations. - * - * The grace period cannot complete until the initialization - * process finishes, because this kthread handles both. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - rdp = this_cpu_ptr(rsp->rda); - rcu_preempt_check_blocked_tasks(rnp); - rnp->qsmask = rnp->qsmaskinit; - ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; - WARN_ON_ONCE(rnp->completed != rsp->completed); - ACCESS_ONCE(rnp->completed) = rsp->completed; - if (rnp == rdp->mynode) - __note_gp_changes(rsp, rnp, rdp); - rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, - rnp->level, rnp->grplo, - rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq(&rnp->lock); -#ifdef CONFIG_PROVE_RCU_DELAY - if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && - system_state == SYSTEM_RUNNING) - udelay(200); -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ - cond_resched(); - } - - mutex_unlock(&rsp->onoff_mutex); - return 1; -} - -/* - * Do one round of quiescent-state forcing. - */ -static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) -{ - int fqs_state = fqs_state_in; - bool isidle = false; - unsigned long maxj; - struct rcu_node *rnp = rcu_get_root(rsp); - - rsp->n_force_qs++; - if (fqs_state == RCU_SAVE_DYNTICK) { - /* Collect dyntick-idle snapshots. */ - if (is_sysidle_rcu_state(rsp)) { - isidle = 1; - maxj = jiffies - ULONG_MAX / 4; - } - force_qs_rnp(rsp, dyntick_save_progress_counter, - &isidle, &maxj); - rcu_sysidle_report_gp(rsp, isidle, maxj); - fqs_state = RCU_FORCE_QS; - } else { - /* Handle dyntick-idle and offline CPUs. */ - isidle = 0; - force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); - } - /* Clear flag to prevent immediate re-entry. */ - if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - raw_spin_lock_irq(&rnp->lock); - rsp->gp_flags &= ~RCU_GP_FLAG_FQS; - raw_spin_unlock_irq(&rnp->lock); - } - return fqs_state; -} - -/* - * Clean up after the old grace period. - */ -static void rcu_gp_cleanup(struct rcu_state *rsp) -{ - unsigned long gp_duration; - int nocb = 0; - struct rcu_data *rdp; - struct rcu_node *rnp = rcu_get_root(rsp); - - raw_spin_lock_irq(&rnp->lock); - gp_duration = jiffies - rsp->gp_start; - if (gp_duration > rsp->gp_max) - rsp->gp_max = gp_duration; - - /* - * We know the grace period is complete, but to everyone else - * it appears to still be ongoing. But it is also the case - * that to everyone else it looks like there is nothing that - * they can do to advance the grace period. It is therefore - * safe for us to drop the lock in order to mark the grace - * period as completed in all of the rcu_node structures. - */ - raw_spin_unlock_irq(&rnp->lock); - - /* - * Propagate new ->completed value to rcu_node structures so - * that other CPUs don't have to wait until the start of the next - * grace period to process their callbacks. This also avoids - * some nasty RCU grace-period initialization races by forcing - * the end of the current grace period to be completely recorded in - * all of the rcu_node structures before the beginning of the next - * grace period is recorded in any of the rcu_node structures. - */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - ACCESS_ONCE(rnp->completed) = rsp->gpnum; - rdp = this_cpu_ptr(rsp->rda); - if (rnp == rdp->mynode) - __note_gp_changes(rsp, rnp, rdp); - nocb += rcu_future_gp_cleanup(rsp, rnp); - raw_spin_unlock_irq(&rnp->lock); - cond_resched(); - } - rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - rcu_nocb_gp_set(rnp, nocb); - - rsp->completed = rsp->gpnum; /* Declare grace period done. */ - trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); - rsp->fqs_state = RCU_GP_IDLE; - rdp = this_cpu_ptr(rsp->rda); - rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ - if (cpu_needs_another_gp(rsp, rdp)) { - rsp->gp_flags = RCU_GP_FLAG_INIT; - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("newreq")); - } - raw_spin_unlock_irq(&rnp->lock); -} - -/* - * Body of kthread that handles grace periods. - */ -static int __noreturn rcu_gp_kthread(void *arg) -{ - int fqs_state; - int gf; - unsigned long j; - int ret; - struct rcu_state *rsp = arg; - struct rcu_node *rnp = rcu_get_root(rsp); - - for (;;) { - - /* Handle grace-period start. */ - for (;;) { - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("reqwait")); - wait_event_interruptible(rsp->gp_wq, - ACCESS_ONCE(rsp->gp_flags) & - RCU_GP_FLAG_INIT); - if (rcu_gp_init(rsp)) - break; - cond_resched(); - flush_signals(current); - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("reqwaitsig")); - } - - /* Handle quiescent-state forcing. */ - fqs_state = RCU_SAVE_DYNTICK; - j = jiffies_till_first_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_first_fqs = HZ; - } - ret = 0; - for (;;) { - if (!ret) - rsp->jiffies_force_qs = jiffies + j; - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("fqswait")); - ret = wait_event_interruptible_timeout(rsp->gp_wq, - ((gf = ACCESS_ONCE(rsp->gp_flags)) & - RCU_GP_FLAG_FQS) || - (!ACCESS_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)), - j); - /* If grace period done, leave loop. */ - if (!ACCESS_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)) - break; - /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || - (gf & RCU_GP_FLAG_FQS)) { - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("fqsstart")); - fqs_state = rcu_gp_fqs(rsp, fqs_state); - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("fqsend")); - cond_resched(); - } else { - /* Deal with stray signal. */ - cond_resched(); - flush_signals(current); - trace_rcu_grace_period(rsp->name, - ACCESS_ONCE(rsp->gpnum), - TPS("fqswaitsig")); - } - j = jiffies_till_next_fqs; - if (j > HZ) { - j = HZ; - jiffies_till_next_fqs = HZ; - } else if (j < 1) { - j = 1; - jiffies_till_next_fqs = 1; - } - } - - /* Handle grace-period end. */ - rcu_gp_cleanup(rsp); - } -} - -static void rsp_wakeup(struct irq_work *work) -{ - struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); - - /* Wake up rcu_gp_kthread() to start the grace period. */ - wake_up(&rsp->gp_wq); -} - -/* - * Start a new RCU grace period if warranted, re-initializing the hierarchy - * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock and hard irqs must be disabled. - * - * Note that it is legal for a dying CPU (which is marked as offline) to - * invoke this function. This can happen when the dying CPU reports its - * quiescent state. - */ -static void -rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, - struct rcu_data *rdp) -{ - if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { - /* - * Either we have not yet spawned the grace-period - * task, this CPU does not need another grace period, - * or a grace period is already in progress. - * Either way, don't start a new grace period. - */ - return; - } - rsp->gp_flags = RCU_GP_FLAG_INIT; - trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), - TPS("newreq")); - - /* - * We can't do wakeups while holding the rnp->lock, as that - * could cause possible deadlocks with the rq->lock. Defer - * the wakeup to interrupt context. And don't bother waking - * up the running kthread. - */ - if (current != rsp->gp_kthread) - irq_work_queue(&rsp->wakeup_work); -} - -/* - * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's - * callbacks. Note that rcu_start_gp_advanced() cannot do this because it - * is invoked indirectly from rcu_advance_cbs(), which would result in - * endless recursion -- or would do so if it wasn't for the self-deadlock - * that is encountered beforehand. - */ -static void -rcu_start_gp(struct rcu_state *rsp) -{ - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - - /* - * If there is no grace period in progress right now, any - * callbacks we have up to this point will be satisfied by the - * next grace period. Also, advancing the callbacks reduces the - * probability of false positives from cpu_needs_another_gp() - * resulting in pointless grace periods. So, advance callbacks - * then start the grace period! - */ - rcu_advance_cbs(rsp, rnp, rdp); - rcu_start_gp_advanced(rsp, rnp, rdp); -} - -/* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, which - * is released before return. - */ -static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) -{ - WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ -} - -/* - * Similar to rcu_report_qs_rdp(), for which it is a helper function. - * Allows quiescent states for a group of CPUs to be reported at one go - * to the specified rcu_node structure, though all the CPUs in the group - * must be represented by the same rcu_node structure (which need not be - * a leaf rcu_node structure, though it often will be). That structure's - * lock must be held upon entry, and it is released before return. - */ -static void -rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) -{ - struct rcu_node *rnp_c; - - /* Walk up the rcu_node hierarchy. */ - for (;;) { - if (!(rnp->qsmask & mask)) { - - /* Our bit has already been cleared, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - rnp->qsmask &= ~mask; - trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, - mask, rnp->qsmask, rnp->level, - rnp->grplo, rnp->grphi, - !!rnp->gp_tasks); - if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - - /* Other bits still set at this level, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - mask = rnp->grpmask; - if (rnp->parent == NULL) { - - /* No more levels. Exit loop holding root lock. */ - - break; - } - raw_spin_unlock_irqrestore(&rnp->lock, flags); - rnp_c = rnp; - rnp = rnp->parent; - raw_spin_lock_irqsave(&rnp->lock, flags); - WARN_ON_ONCE(rnp_c->qsmask); - } - - /* - * Get here if we are the last CPU to pass through a quiescent - * state for this grace period. Invoke rcu_report_qs_rsp() - * to clean up and start the next grace period if one is needed. - */ - rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ -} - -/* - * Record a quiescent state for the specified CPU to that CPU's rcu_data - * structure. This must be either called from the specified CPU, or - * called when the specified CPU is known to be offline (and when it is - * also known that no other CPU is concurrently trying to help the offline - * CPU). The lastcomp argument is used to make sure we are still in the - * grace period of interest. We don't want to end the current grace period - * based on quiescent states detected in an earlier grace period! - */ -static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp; - - rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum) { - - /* - * The grace period in which this quiescent state was - * recorded has ended, so don't report it upwards. - * We will instead need a new quiescent state that lies - * within the current grace period. - */ - rdp->passed_quiesce = 0; /* need qs for new gp. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - mask = rdp->grpmask; - if ((rnp->qsmask & mask) == 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } else { - rdp->qs_pending = 0; - - /* - * This GP can't end until cpu checks in, so all of our - * callbacks can be processed during the next GP. - */ - rcu_accelerate_cbs(rsp, rnp, rdp); - - rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ - } -} - -/* - * Check to see if there is a new grace period of which this CPU - * is not yet aware, and if so, set up local rcu_data state for it. - * Otherwise, see if this CPU has just passed through its first - * quiescent state for this grace period, and record that fact if so. - */ -static void -rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) -{ - /* Check for grace-period ends and beginnings. */ - note_gp_changes(rsp, rdp); - - /* - * Does this CPU still need to do its part for current grace period? - * If no, return and let the other CPUs do their part as well. - */ - if (!rdp->qs_pending) - return; - - /* - * Was there a quiescent state since the beginning of the grace - * period? If no, then exit and wait for the next call. - */ - if (!rdp->passed_quiesce) - return; - - /* - * Tell RCU we are done (but rcu_report_qs_rdp() will be the - * judge of that). - */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp); -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Send the specified CPU's RCU callbacks to the orphanage. The - * specified CPU must be offline, and the caller must hold the - * ->orphan_lock. - */ -static void -rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, - struct rcu_node *rnp, struct rcu_data *rdp) -{ - /* No-CBs CPUs do not have orphanable callbacks. */ - if (rcu_is_nocb_cpu(rdp->cpu)) - return; - - /* - * Orphan the callbacks. First adjust the counts. This is safe - * because _rcu_barrier() excludes CPU-hotplug operations, so it - * cannot be running now. Thus no memory barrier is required. - */ - if (rdp->nxtlist != NULL) { - rsp->qlen_lazy += rdp->qlen_lazy; - rsp->qlen += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; - rdp->qlen_lazy = 0; - ACCESS_ONCE(rdp->qlen) = 0; - } - - /* - * Next, move those callbacks still needing a grace period to - * the orphanage, where some other CPU will pick them up. - * Some of the callbacks might have gone partway through a grace - * period, but that is too bad. They get to start over because we - * cannot assume that grace periods are synchronized across CPUs. - * We don't bother updating the ->nxttail[] array yet, instead - * we just reset the whole thing later on. - */ - if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { - *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; - rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = NULL; - } - - /* - * Then move the ready-to-invoke callbacks to the orphanage, - * where some other CPU will pick them up. These will not be - * required to pass though another grace period: They are done. - */ - if (rdp->nxtlist != NULL) { - *rsp->orphan_donetail = rdp->nxtlist; - rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; - } - - /* Finally, initialize the rcu_data structure's list to empty. */ - init_callback_list(rdp); -} - -/* - * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->orphan_lock. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) -{ - int i; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - - /* No-CBs CPUs are handled specially. */ - if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) - return; - - /* Do the accounting first. */ - rdp->qlen_lazy += rsp->qlen_lazy; - rdp->qlen += rsp->qlen; - rdp->n_cbs_adopted += rsp->qlen; - if (rsp->qlen_lazy != rsp->qlen) - rcu_idle_count_callbacks_posted(); - rsp->qlen_lazy = 0; - rsp->qlen = 0; - - /* - * We do not need a memory barrier here because the only way we - * can get here if there is an rcu_barrier() in flight is if - * we are the task doing the rcu_barrier(). - */ - - /* First adopt the ready-to-invoke callbacks. */ - if (rsp->orphan_donelist != NULL) { - *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; - for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) - if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[i] = rsp->orphan_donetail; - rsp->orphan_donelist = NULL; - rsp->orphan_donetail = &rsp->orphan_donelist; - } - - /* And then adopt the callbacks that still need a grace period. */ - if (rsp->orphan_nxtlist != NULL) { - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; - rsp->orphan_nxtlist = NULL; - rsp->orphan_nxttail = &rsp->orphan_nxtlist; - } -} - -/* - * Trace the fact that this CPU is going offline. - */ -static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) -{ - RCU_TRACE(unsigned long mask); - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); - RCU_TRACE(struct rcu_node *rnp = rdp->mynode); - - RCU_TRACE(mask = rdp->grpmask); - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - !!(rnp->qsmask & mask), - TPS("cpuofl")); -} - -/* - * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup, - * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them. There can only be one CPU hotplug operation at a time, - * so no other CPU can be attempting to update rcu_cpu_kthread_task. - */ -static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - int need_report = 0; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - - /* Adjust any no-longer-needed kthreads. */ - rcu_boost_kthread_setaffinity(rnp, -1); - - /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ - - /* Exclude any attempts to start a new grace period. */ - mutex_lock(&rsp->onoff_mutex); - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); - - /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ - rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp); - - /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - mask = rdp->grpmask; /* rnp->grplo is constant. */ - do { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->qsmaskinit &= ~mask; - if (rnp->qsmaskinit != 0) { - if (rnp != rdp->mynode) - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - break; - } - if (rnp == rdp->mynode) - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - else - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - mask = rnp->grpmask; - rnp = rnp->parent; - } while (rnp != NULL); - - /* - * We still hold the leaf rcu_node structure lock here, and - * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock - * held leads to deadlock. - */ - raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ - rnp = rdp->mynode; - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); - WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, - "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", - cpu, rdp->qlen, rdp->nxtlist); - init_callback_list(rdp); - /* Disallow further callbacks on this CPU. */ - rdp->nxttail[RCU_NEXT_TAIL] = NULL; - mutex_unlock(&rsp->onoff_mutex); -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) -{ -} - -static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - -/* - * Invoke any RCU callbacks that have made it to the end of their grace - * period. Thottle as specified by rdp->blimit. - */ -static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) -{ - unsigned long flags; - struct rcu_head *next, *list, **tail; - long bl, count, count_lazy; - int i; - - /* If no callbacks are ready, just return. */ - if (!cpu_has_callbacks_ready_to_invoke(rdp)) { - trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); - trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), - need_resched(), is_idle_task(current), - rcu_is_callbacks_kthread()); - return; - } - - /* - * Extract the list of ready callbacks, disabling to prevent - * races with call_rcu() from interrupt handlers. - */ - local_irq_save(flags); - WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); - bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); - list = rdp->nxtlist; - rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = NULL; - tail = rdp->nxttail[RCU_DONE_TAIL]; - for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) - if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[i] = &rdp->nxtlist; - local_irq_restore(flags); - - /* Invoke callbacks. */ - count = count_lazy = 0; - while (list) { - next = list->next; - prefetch(next); - debug_rcu_head_unqueue(list); - if (__rcu_reclaim(rsp->name, list)) - count_lazy++; - list = next; - /* Stop only if limit reached and CPU has something to do. */ - if (++count >= bl && - (need_resched() || - (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) - break; - } - - local_irq_save(flags); - trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), - is_idle_task(current), - rcu_is_callbacks_kthread()); - - /* Update count, and requeue any remaining callbacks. */ - if (list != NULL) { - *tail = rdp->nxtlist; - rdp->nxtlist = list; - for (i = 0; i < RCU_NEXT_SIZE; i++) - if (&rdp->nxtlist == rdp->nxttail[i]) - rdp->nxttail[i] = tail; - else - break; - } - smp_mb(); /* List handling before counting for rcu_barrier(). */ - rdp->qlen_lazy -= count_lazy; - ACCESS_ONCE(rdp->qlen) -= count; - rdp->n_cbs_invoked += count; - - /* Reinstate batch limit if we have worked down the excess. */ - if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) - rdp->blimit = blimit; - - /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ - if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { - rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rsp->n_force_qs; - } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) - rdp->qlen_last_fqs_check = rdp->qlen; - WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); - - local_irq_restore(flags); - - /* Re-invoke RCU core processing if there are callbacks remaining. */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) - invoke_rcu_core(); -} - -/* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. If rcu_pending returns - * false, there is no point in invoking rcu_check_callbacks(). - */ -void rcu_check_callbacks(int cpu, int user) -{ - trace_rcu_utilization(TPS("Start scheduler-tick")); - increment_cpu_stall_ticks(); - if (user || rcu_is_cpu_rrupt_from_idle()) { - - /* - * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. - * - * No memory barrier is required here because both - * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local - * variables that other CPUs neither access nor modify, - * at least not while the corresponding CPU is online. - */ - - rcu_sched_qs(cpu); - rcu_bh_qs(cpu); - - } else if (!in_softirq()) { - - /* - * Get here if this CPU did not take its interrupt from - * softirq, in other words, if it is not interrupting - * a rcu_bh read-side critical section. This is an _bh - * critical section, so note it. - */ - - rcu_bh_qs(cpu); - } - rcu_preempt_check_callbacks(cpu); - if (rcu_pending(cpu)) - invoke_rcu_core(); - trace_rcu_utilization(TPS("End scheduler-tick")); -} - -/* - * Scan the leaf rcu_node structures, processing dyntick state for any that - * have not yet encountered a quiescent state, using the function specified. - * Also initiate boosting for any threads blocked on the root rcu_node. - * - * The caller must have suppressed start of new grace periods. - */ -static void force_qs_rnp(struct rcu_state *rsp, - int (*f)(struct rcu_data *rsp, bool *isidle, - unsigned long *maxj), - bool *isidle, unsigned long *maxj) -{ - unsigned long bit; - int cpu; - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rsp, rnp) { - cond_resched(); - mask = 0; - raw_spin_lock_irqsave(&rnp->lock, flags); - if (!rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - if (rnp->qsmask == 0) { - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - continue; - } - cpu = rnp->grplo; - bit = 1; - for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0) { - if ((rnp->qsmaskinit & bit) != 0) - *isidle = 0; - if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) - mask |= bit; - } - } - if (mask != 0) { - - /* rcu_report_qs_rnp() releases rnp->lock. */ - rcu_report_qs_rnp(mask, rsp, rnp, flags); - continue; - } - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } - rnp = rcu_get_root(rsp); - if (rnp->qsmask == 0) { - raw_spin_lock_irqsave(&rnp->lock, flags); - rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ - } -} - -/* - * Force quiescent states on reluctant CPUs, and also detect which - * CPUs are in dyntick-idle mode. - */ -static void force_quiescent_state(struct rcu_state *rsp) -{ - unsigned long flags; - bool ret; - struct rcu_node *rnp; - struct rcu_node *rnp_old = NULL; - - /* Funnel through hierarchy to reduce memory contention. */ - rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; - for (; rnp != NULL; rnp = rnp->parent) { - ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || - !raw_spin_trylock(&rnp->fqslock); - if (rnp_old != NULL) - raw_spin_unlock(&rnp_old->fqslock); - if (ret) { - rsp->n_force_qs_lh++; - return; - } - rnp_old = rnp; - } - /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ - - /* Reached the root of the rcu_node tree, acquire lock. */ - raw_spin_lock_irqsave(&rnp_old->lock, flags); - raw_spin_unlock(&rnp_old->fqslock); - if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - rsp->n_force_qs_lh++; - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - return; /* Someone beat us to it. */ - } - rsp->gp_flags |= RCU_GP_FLAG_FQS; - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ -} - -/* - * This does the RCU core processing work for the specified rcu_state - * and rcu_data structures. This may be called only from the CPU to - * whom the rdp belongs. - */ -static void -__rcu_process_callbacks(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - - WARN_ON_ONCE(rdp->beenonline == 0); - - /* Update RCU state based on any recent quiescent states. */ - rcu_check_quiescent_state(rsp, rdp); - - /* Does this CPU require a not-yet-started grace period? */ - local_irq_save(flags); - if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ - rcu_start_gp(rsp); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - } else { - local_irq_restore(flags); - } - - /* If there are callbacks ready, invoke them. */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) - invoke_rcu_callbacks(rsp, rdp); -} - -/* - * Do RCU core processing for the current CPU. - */ -static void rcu_process_callbacks(struct softirq_action *unused) -{ - struct rcu_state *rsp; - - if (cpu_is_offline(smp_processor_id())) - return; - trace_rcu_utilization(TPS("Start RCU core")); - for_each_rcu_flavor(rsp) - __rcu_process_callbacks(rsp); - trace_rcu_utilization(TPS("End RCU core")); -} - -/* - * Schedule RCU callback invocation. If the specified type of RCU - * does not support RCU priority boosting, just do a direct call, - * otherwise wake up the per-CPU kernel kthread. Note that because we - * are running on the current CPU with interrupts disabled, the - * rcu_cpu_kthread_task cannot disappear out from under us. - */ -static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) -{ - if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) - return; - if (likely(!rsp->boost)) { - rcu_do_batch(rsp, rdp); - return; - } - invoke_rcu_callbacks_kthread(); -} - -static void invoke_rcu_core(void) -{ - if (cpu_online(smp_processor_id())) - raise_softirq(RCU_SOFTIRQ); -} - -/* - * Handle any core-RCU processing required by a call_rcu() invocation. - */ -static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, - struct rcu_head *head, unsigned long flags) -{ - /* - * If called from an extended quiescent state, invoke the RCU - * core in order to force a re-evaluation of RCU's idleness. - */ - if (!rcu_is_watching() && cpu_online(smp_processor_id())) - invoke_rcu_core(); - - /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ - if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) - return; - - /* - * Force the grace period if too many callbacks or too long waiting. - * Enforce hysteresis, and don't invoke force_quiescent_state() - * if some other CPU has recently done so. Also, don't bother - * invoking force_quiescent_state() if the newly enqueued callback - * is the only one waiting for a grace period to complete. - */ - if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { - - /* Are we ignoring a completed grace period? */ - note_gp_changes(rsp, rdp); - - /* Start a new grace period if one not already started. */ - if (!rcu_gp_in_progress(rsp)) { - struct rcu_node *rnp_root = rcu_get_root(rsp); - - raw_spin_lock(&rnp_root->lock); - rcu_start_gp(rsp); - raw_spin_unlock(&rnp_root->lock); - } else { - /* Give the grace period a kick. */ - rdp->blimit = LONG_MAX; - if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp); - rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; - } - } -} - -/* - * RCU callback function to leak a callback. - */ -static void rcu_leak_callback(struct rcu_head *rhp) -{ -} - -/* - * Helper function for call_rcu() and friends. The cpu argument will - * normally be -1, indicating "currently running CPU". It may specify - * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() - * is expected to specify a CPU. - */ -static void -__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp, int cpu, bool lazy) -{ - unsigned long flags; - struct rcu_data *rdp; - - WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ - if (debug_rcu_head_queue(head)) { - /* Probable double call_rcu(), so leak the callback. */ - ACCESS_ONCE(head->func) = rcu_leak_callback; - WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); - return; - } - head->func = func; - head->next = NULL; - - /* - * Opportunistically note grace-period endings and beginnings. - * Note that we might see a beginning right after we see an - * end, but never vice versa, since this CPU has to pass through - * a quiescent state betweentimes. - */ - local_irq_save(flags); - rdp = this_cpu_ptr(rsp->rda); - - /* Add the callback to our list. */ - if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { - int offline; - - if (cpu != -1) - rdp = per_cpu_ptr(rsp->rda, cpu); - offline = !__call_rcu_nocb(rdp, head, lazy); - WARN_ON_ONCE(offline); - /* _call_rcu() is illegal on offline CPU; leak the callback. */ - local_irq_restore(flags); - return; - } - ACCESS_ONCE(rdp->qlen)++; - if (lazy) - rdp->qlen_lazy++; - else - rcu_idle_count_callbacks_posted(); - smp_mb(); /* Count before adding callback for rcu_barrier(). */ - *rdp->nxttail[RCU_NEXT_TAIL] = head; - rdp->nxttail[RCU_NEXT_TAIL] = &head->next; - - if (__is_kfree_rcu_offset((unsigned long)func)) - trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen_lazy, rdp->qlen); - else - trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); - - /* Go handle any RCU core processing required. */ - __call_rcu_core(rsp, rdp, head, flags); - local_irq_restore(flags); -} - -/* - * Queue an RCU-sched callback for invocation after a grace period. - */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_sched_state, -1, 0); -} -EXPORT_SYMBOL_GPL(call_rcu_sched); - -/* - * Queue an RCU callback for invocation after a quicker grace period. - */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_bh_state, -1, 0); -} -EXPORT_SYMBOL_GPL(call_rcu_bh); - -/* - * Because a context switch is a grace period for RCU-sched and RCU-bh, - * any blocking grace-period wait automatically implies a grace period - * if there is only one CPU online at any point time during execution - * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds - * some overhead: RCU still operates correctly. - */ -static inline int rcu_blocking_is_gp(void) -{ - int ret; - - might_sleep(); /* Check for RCU read-side critical section. */ - preempt_disable(); - ret = num_online_cpus() <= 1; - preempt_enable(); - return ret; -} - -/** - * synchronize_sched - wait until an rcu-sched grace period has elapsed. - * - * Control will return to the caller some time after a full rcu-sched - * grace period has elapsed, in other words after all currently executing - * rcu-sched read-side critical sections have completed. These read-side - * critical sections are delimited by rcu_read_lock_sched() and - * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), - * local_irq_disable(), and so on may be used in place of - * rcu_read_lock_sched(). - * - * This means that all preempt_disable code sequences, including NMI and - * non-threaded hardware-interrupt handlers, in progress on entry will - * have completed before this primitive returns. However, this does not - * guarantee that softirq handlers will have completed, since in some - * kernels, these handlers can run in process context, and can block. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_sched() returns, - * each CPU is guaranteed to have executed a full memory barrier since the - * end of its last RCU-sched read-side critical section whose beginning - * preceded the call to synchronize_sched(). In addition, each CPU having - * an RCU read-side critical section that extends beyond the return from - * synchronize_sched() is guaranteed to have executed a full memory barrier - * after the beginning of synchronize_sched() and before the beginning of - * that RCU read-side critical section. Note that these guarantees include - * CPUs that are offline, idle, or executing in user mode, as well as CPUs - * that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_sched(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - * - * This primitive provides the guarantees made by the (now removed) - * synchronize_kernel() API. In contrast, synchronize_rcu() only - * guarantees that rcu_read_lock() sections will have completed. - * In "classic RCU", these two guarantees happen to be one and - * the same, but can differ in realtime RCU implementations. - */ -void synchronize_sched(void) -{ - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU-sched read-side critical section"); - if (rcu_blocking_is_gp()) - return; - if (rcu_expedited) - synchronize_sched_expedited(); - else - wait_rcu_gp(call_rcu_sched); -} -EXPORT_SYMBOL_GPL(synchronize_sched); - -/** - * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. - * - * Control will return to the caller some time after a full rcu_bh grace - * period has elapsed, in other words after all currently executing rcu_bh - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), - * and may be nested. - * - * See the description of synchronize_sched() for more detailed information - * on memory ordering guarantees. - */ -void synchronize_rcu_bh(void) -{ - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); - if (rcu_blocking_is_gp()) - return; - if (rcu_expedited) - synchronize_rcu_bh_expedited(); - else - wait_rcu_gp(call_rcu_bh); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_bh); - -static int synchronize_sched_expedited_cpu_stop(void *data) -{ - /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. - */ - smp_mb(); /* See above comment block. */ - return 0; -} - -/** - * synchronize_sched_expedited - Brute-force RCU-sched grace period - * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. - * - * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal - * to call this function from a CPU-hotplug notifier. Failing to observe - * these restriction will result in deadlock. - * - * This implementation can be thought of as an application of ticket - * locking to RCU, with sync_sched_expedited_started and - * sync_sched_expedited_done taking on the roles of the halves - * of the ticket-lock word. Each task atomically increments - * sync_sched_expedited_started upon entry, snapshotting the old value, - * then attempts to stop all the CPUs. If this succeeds, then each - * CPU will have executed a context switch, resulting in an RCU-sched - * grace period. We are then done, so we use atomic_cmpxchg() to - * update sync_sched_expedited_done to match our snapshot -- but - * only if someone else has not already advanced past our snapshot. - * - * On the other hand, if try_stop_cpus() fails, we check the value - * of sync_sched_expedited_done. If it has advanced past our - * initial snapshot, then someone else must have forced a grace period - * some time after we took our snapshot. In this case, our work is - * done for us, and we can simply return. Otherwise, we try again, - * but keep our initial snapshot for purposes of checking for someone - * doing our work for us. - * - * If we fail too many times in a row, we fall back to synchronize_sched(). - */ -void synchronize_sched_expedited(void) -{ - long firstsnap, s, snap; - int trycount = 0; - struct rcu_state *rsp = &rcu_sched_state; - - /* - * If we are in danger of counter wrap, just do synchronize_sched(). - * By allowing sync_sched_expedited_started to advance no more than - * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring - * that more than 3.5 billion CPUs would be required to force a - * counter wrap on a 32-bit system. Quite a few more CPUs would of - * course be required on a 64-bit system. - */ - if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), - (ulong)atomic_long_read(&rsp->expedited_done) + - ULONG_MAX / 8)) { - synchronize_sched(); - atomic_long_inc(&rsp->expedited_wrap); - return; - } - - /* - * Take a ticket. Note that atomic_inc_return() implies a - * full memory barrier. - */ - snap = atomic_long_inc_return(&rsp->expedited_start); - firstsnap = snap; - get_online_cpus(); - WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); - - /* - * Each pass through the following loop attempts to force a - * context switch on each CPU. - */ - while (try_stop_cpus(cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { - put_online_cpus(); - atomic_long_inc(&rsp->expedited_tryfail); - - /* Check to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone1); - return; - } - - /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - return; - } - - /* Recheck to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone2); - return; - } - - /* - * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We retry - * after they started, so our grace period works for them, - * and they started after our first try, so their grace - * period works for us. - */ - get_online_cpus(); - snap = atomic_long_read(&rsp->expedited_start); - smp_mb(); /* ensure read is before try_stop_cpus(). */ - } - atomic_long_inc(&rsp->expedited_stoppedcpus); - - /* - * Everyone up to our most recent fetch is covered by our grace - * period. Update the counter, but only if our work is still - * relevant -- which it won't be if someone who started later - * than we did already did their update. - */ - do { - atomic_long_inc(&rsp->expedited_done_tries); - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic_inc(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_done_lost); - break; - } - } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); - atomic_long_inc(&rsp->expedited_done_exit); - - put_online_cpus(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, for the specified type of RCU, returning 1 if so. - * The checks are in order of increasing expense: checks that can be - * carried out against CPU-local state are performed first. However, - * we must check for CPU stalls first, else we might not get a chance. - */ -static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) -{ - struct rcu_node *rnp = rdp->mynode; - - rdp->n_rcu_pending++; - - /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rsp, rdp); - - /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce) { - rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && rdp->passed_quiesce) { - rdp->n_rp_report_qs++; - return 1; - } - - /* Does this CPU have callbacks ready to invoke? */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) { - rdp->n_rp_cb_ready++; - return 1; - } - - /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) { - rdp->n_rp_cpu_needs_gp++; - return 1; - } - - /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ - rdp->n_rp_gp_completed++; - return 1; - } - - /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ - rdp->n_rp_gp_started++; - return 1; - } - - /* nothing to do */ - rdp->n_rp_need_nothing++; - return 0; -} - -/* - * Check to see if there is any immediate RCU-related work to be done - * by the current CPU, returning 1 if so. This function is part of the - * RCU implementation; it is -not- an exported member of the RCU API. - */ -static int rcu_pending(int cpu) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) - return 1; - return 0; -} - -/* - * Return true if the specified CPU has any callback. If all_lazy is - * non-NULL, store an indication of whether all callbacks are lazy. - * (If there are no callbacks, all of them are deemed to be lazy.) - */ -static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) -{ - bool al = true; - bool hc = false; - struct rcu_data *rdp; - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (!rdp->nxtlist) - continue; - hc = true; - if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { - al = false; - break; - } - } - if (all_lazy) - *all_lazy = al; - return hc; -} - -/* - * Helper function for _rcu_barrier() tracing. If tracing is disabled, - * the compiler is expected to optimize this away. - */ -static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, - int cpu, unsigned long done) -{ - trace_rcu_barrier(rsp->name, s, cpu, - atomic_read(&rsp->barrier_cpu_count), done); -} - -/* - * RCU callback function for _rcu_barrier(). If we are last, wake - * up the task executing _rcu_barrier(). - */ -static void rcu_barrier_callback(struct rcu_head *rhp) -{ - struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); - struct rcu_state *rsp = rdp->rsp; - - if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); - complete(&rsp->barrier_completion); - } else { - _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); - } -} - -/* - * Called with preemption disabled, and from cross-cpu IRQ context. - */ -static void rcu_barrier_func(void *type) -{ - struct rcu_state *rsp = type; - struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); - - _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); - atomic_inc(&rsp->barrier_cpu_count); - rsp->call(&rdp->barrier_head, rcu_barrier_callback); -} - -/* - * Orchestrate the specified type of RCU barrier, waiting for all - * RCU callbacks of the specified type to complete. - */ -static void _rcu_barrier(struct rcu_state *rsp) -{ - int cpu; - struct rcu_data *rdp; - unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); - unsigned long snap_done; - - _rcu_barrier_trace(rsp, "Begin", -1, snap); - - /* Take mutex to serialize concurrent rcu_barrier() requests. */ - mutex_lock(&rsp->barrier_mutex); - - /* - * Ensure that all prior references, including to ->n_barrier_done, - * are ordered before the _rcu_barrier() machinery. - */ - smp_mb(); /* See above block comment. */ - - /* - * Recheck ->n_barrier_done to see if others did our work for us. - * This means checking ->n_barrier_done for an even-to-odd-to-even - * transition. The "if" expression below therefore rounds the old - * value up to the next even number and adds two before comparing. - */ - snap_done = rsp->n_barrier_done; - _rcu_barrier_trace(rsp, "Check", -1, snap_done); - - /* - * If the value in snap is odd, we needed to wait for the current - * rcu_barrier() to complete, then wait for the next one, in other - * words, we need the value of snap_done to be three larger than - * the value of snap. On the other hand, if the value in snap is - * even, we only had to wait for the next rcu_barrier() to complete, - * in other words, we need the value of snap_done to be only two - * greater than the value of snap. The "(snap + 3) & ~0x1" computes - * this for us (thank you, Linus!). - */ - if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { - _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); - smp_mb(); /* caller's subsequent code after above check. */ - mutex_unlock(&rsp->barrier_mutex); - return; - } - - /* - * Increment ->n_barrier_done to avoid duplicate work. Use - * ACCESS_ONCE() to prevent the compiler from speculating - * the increment to precede the early-exit check. - */ - ACCESS_ONCE(rsp->n_barrier_done)++; - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); - _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); - smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ - - /* - * Initialize the count to one rather than to zero in order to - * avoid a too-soon return to zero in case of a short grace period - * (or preemption of this task). Exclude CPU-hotplug operations - * to ensure that no offline CPU has callbacks queued. - */ - init_completion(&rsp->barrier_completion); - atomic_set(&rsp->barrier_cpu_count, 1); - get_online_cpus(); - - /* - * Force each CPU with callbacks to register a new callback. - * When that callback is invoked, we will know that all of the - * corresponding CPU's preceding callbacks have been invoked. - */ - for_each_possible_cpu(cpu) { - if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) - continue; - rdp = per_cpu_ptr(rsp->rda, cpu); - if (rcu_is_nocb_cpu(cpu)) { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); - atomic_inc(&rsp->barrier_cpu_count); - __call_rcu(&rdp->barrier_head, rcu_barrier_callback, - rsp, cpu, 0); - } else if (ACCESS_ONCE(rdp->qlen)) { - _rcu_barrier_trace(rsp, "OnlineQ", cpu, - rsp->n_barrier_done); - smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); - } else { - _rcu_barrier_trace(rsp, "OnlineNQ", cpu, - rsp->n_barrier_done); - } - } - put_online_cpus(); - - /* - * Now that we have an rcu_barrier_callback() callback on each - * CPU, and thus each counted, remove the initial count. - */ - if (atomic_dec_and_test(&rsp->barrier_cpu_count)) - complete(&rsp->barrier_completion); - - /* Increment ->n_barrier_done to prevent duplicate work. */ - smp_mb(); /* Keep increment after above mechanism. */ - ACCESS_ONCE(rsp->n_barrier_done)++; - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); - _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); - smp_mb(); /* Keep increment before caller's subsequent code. */ - - /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ - wait_for_completion(&rsp->barrier_completion); - - /* Other rcu_barrier() invocations can now safely proceed. */ - mutex_unlock(&rsp->barrier_mutex); -} - -/** - * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. - */ -void rcu_barrier_bh(void) -{ - _rcu_barrier(&rcu_bh_state); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -/** - * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. - */ -void rcu_barrier_sched(void) -{ - _rcu_barrier(&rcu_sched_state); -} -EXPORT_SYMBOL_GPL(rcu_barrier_sched); - -/* - * Do boot-time initialization of a CPU's per-CPU RCU data. - */ -static void __init -rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rcu_get_root(rsp); - - /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); - init_callback_list(rdp); - rdp->qlen_lazy = 0; - ACCESS_ONCE(rdp->qlen) = 0; - rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); - WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); - rdp->cpu = cpu; - rdp->rsp = rsp; - rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Initialize a CPU's per-CPU RCU data. Note that only one online or - * offline event can be happening at a given time. Note also that we - * can accept some slop in the rsp->completed access due to the fact - * that this CPU cannot possibly have any RCU callbacks in flight yet. - */ -static void -rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) -{ - unsigned long flags; - unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rcu_get_root(rsp); - - /* Exclude new grace periods. */ - mutex_lock(&rsp->onoff_mutex); - - /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->beenonline = 1; /* We have now been online. */ - rdp->preemptible = preemptible; - rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->blimit = blimit; - init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ - rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_sysidle_init_percpu_data(rdp->dynticks); - atomic_set(&rdp->dynticks->dynticks, - (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - - /* Add CPU to rcu_node bitmasks. */ - rnp = rdp->mynode; - mask = rdp->grpmask; - do { - /* Exclude any attempts to start a new GP on small systems. */ - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->qsmaskinit |= mask; - mask = rnp->grpmask; - if (rnp == rdp->mynode) { - /* - * If there is a grace period in progress, we will - * set up to wait for it next time we run the - * RCU core code. - */ - rdp->gpnum = rnp->completed; - rdp->completed = rnp->completed; - rdp->passed_quiesce = 0; - rdp->qs_pending = 0; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - } - raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ - rnp = rnp->parent; - } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - local_irq_restore(flags); - - mutex_unlock(&rsp->onoff_mutex); -} - -static void rcu_prepare_cpu(int cpu) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - rcu_init_percpu_data(cpu, rsp, - strcmp(rsp->name, "rcu_preempt") == 0); -} - -/* - * Handle CPU online/offline notification events. - */ -static int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); - struct rcu_node *rnp = rdp->mynode; - struct rcu_state *rsp; - - trace_rcu_utilization(TPS("Start CPU hotplug")); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - rcu_prepare_cpu(cpu); - rcu_prepare_kthreads(cpu); - break; - case CPU_ONLINE: - case CPU_DOWN_FAILED: - rcu_boost_kthread_setaffinity(rnp, -1); - break; - case CPU_DOWN_PREPARE: - rcu_boost_kthread_setaffinity(rnp, cpu); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: - for_each_rcu_flavor(rsp) - rcu_cleanup_dying_cpu(rsp); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - for_each_rcu_flavor(rsp) - rcu_cleanup_dead_cpu(cpu, rsp); - break; - default: - break; - } - trace_rcu_utilization(TPS("End CPU hotplug")); - return NOTIFY_OK; -} - -static int rcu_pm_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - switch (action) { - case PM_HIBERNATION_PREPARE: - case PM_SUSPEND_PREPARE: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_expedited = 1; - break; - case PM_POST_HIBERNATION: - case PM_POST_SUSPEND: - rcu_expedited = 0; - break; - default: - break; - } - return NOTIFY_OK; -} - -/* - * Spawn the kthread that handles this RCU flavor's grace periods. - */ -static int __init rcu_spawn_gp_kthread(void) -{ - unsigned long flags; - struct rcu_node *rnp; - struct rcu_state *rsp; - struct task_struct *t; - - for_each_rcu_flavor(rsp) { - t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); - BUG_ON(IS_ERR(t)); - rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); - rsp->gp_kthread = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - rcu_spawn_nocb_kthreads(rsp); - } - return 0; -} -early_initcall(rcu_spawn_gp_kthread); - -/* - * This function is invoked towards the end of the scheduler's initialization - * process. Before this is called, the idle task might contain - * RCU read-side critical sections (during which time, this idle - * task is booting the system). After this function is called, the - * idle tasks are prohibited from containing RCU read-side critical - * sections. This function also enables RCU lockdep checking. - */ -void rcu_scheduler_starting(void) -{ - WARN_ON(num_online_cpus() != 1); - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; -} - -/* - * Compute the per-level fanout, either using the exact fanout specified - * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. - */ -#ifdef CONFIG_RCU_FANOUT_EXACT -static void __init rcu_init_levelspread(struct rcu_state *rsp) -{ - int i; - - for (i = rcu_num_lvls - 1; i > 0; i--) - rsp->levelspread[i] = CONFIG_RCU_FANOUT; - rsp->levelspread[0] = rcu_fanout_leaf; -} -#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ -static void __init rcu_init_levelspread(struct rcu_state *rsp) -{ - int ccur; - int cprv; - int i; - - cprv = nr_cpu_ids; - for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = rsp->levelcnt[i]; - rsp->levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; - } -} -#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ - -/* - * Helper function for rcu_init() that initializes one rcu_state structure. - */ -static void __init rcu_init_one(struct rcu_state *rsp, - struct rcu_data __percpu *rda) -{ - static char *buf[] = { "rcu_node_0", - "rcu_node_1", - "rcu_node_2", - "rcu_node_3" }; /* Match MAX_RCU_LVLS */ - static char *fqs[] = { "rcu_node_fqs_0", - "rcu_node_fqs_1", - "rcu_node_fqs_2", - "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ - int cpustride = 1; - int i; - int j; - struct rcu_node *rnp; - - BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ - - /* Silence gcc 4.8 warning about array index out of range. */ - if (rcu_num_lvls > RCU_NUM_LVLS) - panic("rcu_init_one: rcu_num_lvls overflow"); - - /* Initialize the level-tracking arrays. */ - - for (i = 0; i < rcu_num_lvls; i++) - rsp->levelcnt[i] = num_rcu_lvl[i]; - for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; - rcu_init_levelspread(rsp); - - /* Initialize the elements themselves, starting from the leaves. */ - - for (i = rcu_num_lvls - 1; i >= 0; i--) { - cpustride *= rsp->levelspread[i]; - rnp = rsp->level[i]; - for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { - raw_spin_lock_init(&rnp->lock); - lockdep_set_class_and_name(&rnp->lock, - &rcu_node_class[i], buf[i]); - raw_spin_lock_init(&rnp->fqslock); - lockdep_set_class_and_name(&rnp->fqslock, - &rcu_fqs_class[i], fqs[i]); - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; - rnp->qsmask = 0; - rnp->qsmaskinit = 0; - rnp->grplo = j * cpustride; - rnp->grphi = (j + 1) * cpustride - 1; - if (rnp->grphi >= NR_CPUS) - rnp->grphi = NR_CPUS - 1; - if (i == 0) { - rnp->grpnum = 0; - rnp->grpmask = 0; - rnp->parent = NULL; - } else { - rnp->grpnum = j % rsp->levelspread[i - 1]; - rnp->grpmask = 1UL << rnp->grpnum; - rnp->parent = rsp->level[i - 1] + - j / rsp->levelspread[i - 1]; - } - rnp->level = i; - INIT_LIST_HEAD(&rnp->blkd_tasks); - rcu_init_one_nocb(rnp); - } - } - - rsp->rda = rda; - init_waitqueue_head(&rsp->gp_wq); - init_irq_work(&rsp->wakeup_work, rsp_wakeup); - rnp = rsp->level[rcu_num_lvls - 1]; - for_each_possible_cpu(i) { - while (i > rnp->grphi) - rnp++; - per_cpu_ptr(rsp->rda, i)->mynode = rnp; - rcu_boot_init_percpu_data(i, rsp); - } - list_add(&rsp->flavors, &rcu_struct_flavors); -} - -/* - * Compute the rcu_node tree geometry from kernel parameters. This cannot - * replace the definitions in rcutree.h because those are needed to size - * the ->node array in the rcu_state structure. - */ -static void __init rcu_init_geometry(void) -{ - ulong d; - int i; - int j; - int n = nr_cpu_ids; - int rcu_capacity[MAX_RCU_LVLS + 1]; - - /* - * Initialize any unspecified boot parameters. - * The default values of jiffies_till_first_fqs and - * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS - * value, which is a function of HZ, then adding one for each - * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. - */ - d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; - if (jiffies_till_first_fqs == ULONG_MAX) - jiffies_till_first_fqs = d; - if (jiffies_till_next_fqs == ULONG_MAX) - jiffies_till_next_fqs = d; - - /* If the compile-time values are accurate, just leave. */ - if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && - nr_cpu_ids == NR_CPUS) - return; - - /* - * Compute number of nodes that can be handled an rcu_node tree - * with the given number of levels. Setting rcu_capacity[0] makes - * some of the arithmetic easier. - */ - rcu_capacity[0] = 1; - rcu_capacity[1] = rcu_fanout_leaf; - for (i = 2; i <= MAX_RCU_LVLS; i++) - rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; - - /* - * The boot-time rcu_fanout_leaf parameter is only permitted - * to increase the leaf-level fanout, not decrease it. Of course, - * the leaf-level fanout cannot exceed the number of bits in - * the rcu_node masks. Finally, the tree must be able to accommodate - * the configured number of CPUs. Complain and fall back to the - * compile-time values if these limits are exceeded. - */ - if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || - rcu_fanout_leaf > sizeof(unsigned long) * 8 || - n > rcu_capacity[MAX_RCU_LVLS]) { - WARN_ON(1); - return; - } - - /* Calculate the number of rcu_nodes at each level of the tree. */ - for (i = 1; i <= MAX_RCU_LVLS; i++) - if (n <= rcu_capacity[i]) { - for (j = 0; j <= i; j++) - num_rcu_lvl[j] = - DIV_ROUND_UP(n, rcu_capacity[i - j]); - rcu_num_lvls = i; - for (j = i + 1; j <= MAX_RCU_LVLS; j++) - num_rcu_lvl[j] = 0; - break; - } - - /* Calculate the total number of rcu_node structures. */ - rcu_num_nodes = 0; - for (i = 0; i <= MAX_RCU_LVLS; i++) - rcu_num_nodes += num_rcu_lvl[i]; - rcu_num_nodes -= n; -} - -void __init rcu_init(void) -{ - int cpu; - - rcu_bootup_announce(); - rcu_init_geometry(); - rcu_init_one(&rcu_bh_state, &rcu_bh_data); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); - __rcu_init_preempt(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - - /* - * We don't need protection against CPU-hotplug here because - * this is called early in boot, before either interrupts - * or the scheduler are operational. - */ - cpu_notifier(rcu_cpu_notify, 0); - pm_notifier(rcu_pm_notify, 0); - for_each_online_cpu(cpu) - rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); -} - -#include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h deleted file mode 100644 index 52be957c9fe2..000000000000 --- a/kernel/rcutree.h +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion (tree-based version) - * Internal non-public definitions. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2008 - * - * Author: Ingo Molnar - * Paul E. McKenney - */ - -#include -#include -#include -#include -#include -#include - -/* - * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and - * CONFIG_RCU_FANOUT_LEAF. - * In theory, it should be possible to add more levels straightforwardly. - * In practice, this did work well going from three levels to four. - * Of course, your mileage may vary. - */ -#define MAX_RCU_LVLS 4 -#define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) -#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) -#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) -#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT_1 -# define RCU_NUM_LVLS 1 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (NR_CPUS) -# define NUM_RCU_LVL_2 0 -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_2 -# define RCU_NUM_LVLS 2 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_2 (NR_CPUS) -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_3 -# define RCU_NUM_LVLS 3 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_3 (NR_CPUS) -# define NUM_RCU_LVL_4 0 -#elif NR_CPUS <= RCU_FANOUT_4 -# define RCU_NUM_LVLS 4 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_4 (NR_CPUS) -#else -# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ - -#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) -#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) - -extern int rcu_num_lvls; -extern int rcu_num_nodes; - -/* - * Dynticks per-CPU state. - */ -struct rcu_dynticks { - long long dynticks_nesting; /* Track irq/process nesting level. */ - /* Process level is worth LLONG_MAX/2. */ - int dynticks_nmi_nesting; /* Track NMI nesting level. */ - atomic_t dynticks; /* Even value for idle, else odd. */ -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - long long dynticks_idle_nesting; - /* irq/process nesting level from idle. */ - atomic_t dynticks_idle; /* Even value for idle, else odd. */ - /* "Idle" excludes userspace execution. */ - unsigned long dynticks_idle_jiffies; - /* End of last non-NMI non-idle period. */ -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -#ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* Are all CPU's CBs lazy? */ - unsigned long nonlazy_posted; - /* # times non-lazy CBs posted to CPU. */ - unsigned long nonlazy_posted_snap; - /* idle-period nonlazy_posted snapshot. */ - unsigned long last_accelerate; - /* Last jiffy CBs were accelerated. */ - unsigned long last_advance_all; - /* Last jiffy CBs were all advanced. */ - int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ -#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -}; - -/* RCU's kthread states for tracing. */ -#define RCU_KTHREAD_STOPPED 0 -#define RCU_KTHREAD_RUNNING 1 -#define RCU_KTHREAD_WAITING 2 -#define RCU_KTHREAD_OFFCPU 3 -#define RCU_KTHREAD_YIELDING 4 -#define RCU_KTHREAD_MAX 4 - -/* - * Definition for node within the RCU grace-period-detection hierarchy. - */ -struct rcu_node { - raw_spinlock_t lock; /* Root rcu_node's lock protects some */ - /* rcu_state fields as well as following. */ - unsigned long gpnum; /* Current grace period for this node. */ - /* This will either be equal to or one */ - /* behind the root rcu_node's gpnum. */ - unsigned long completed; /* Last GP completed for this node. */ - /* This will either be equal to or one */ - /* behind the root rcu_node's gpnum. */ - unsigned long qsmask; /* CPUs or groups that need to switch in */ - /* order for current grace period to proceed.*/ - /* In leaf rcu_node, each bit corresponds to */ - /* an rcu_data structure, otherwise, each */ - /* bit corresponds to a child rcu_node */ - /* structure. */ - unsigned long expmask; /* Groups that have ->blkd_tasks */ - /* elements that need to drain to allow the */ - /* current expedited grace period to */ - /* complete (only for TREE_PREEMPT_RCU). */ - unsigned long qsmaskinit; - /* Per-GP initial value for qsmask & expmask. */ - unsigned long grpmask; /* Mask to apply to parent qsmask. */ - /* Only one bit will be set in this mask. */ - int grplo; /* lowest-numbered CPU or group here. */ - int grphi; /* highest-numbered CPU or group here. */ - u8 grpnum; /* CPU/group number for next level up. */ - u8 level; /* root is at level 0. */ - struct rcu_node *parent; - struct list_head blkd_tasks; - /* Tasks blocked in RCU read-side critical */ - /* section. Tasks are placed at the head */ - /* of this list and age towards the tail. */ - struct list_head *gp_tasks; - /* Pointer to the first task blocking the */ - /* current grace period, or NULL if there */ - /* is no such task. */ - struct list_head *exp_tasks; - /* Pointer to the first task blocking the */ - /* current expedited grace period, or NULL */ - /* if there is no such task. If there */ - /* is no current expedited grace period, */ - /* then there can cannot be any such task. */ -#ifdef CONFIG_RCU_BOOST - struct list_head *boost_tasks; - /* Pointer to first task that needs to be */ - /* priority boosted, or NULL if no priority */ - /* boosting is needed for this rcu_node */ - /* structure. If there are no tasks */ - /* queued on this rcu_node structure that */ - /* are blocking the current grace period, */ - /* there can be no such task. */ - unsigned long boost_time; - /* When to start boosting (jiffies). */ - struct task_struct *boost_kthread_task; - /* kthread that takes care of priority */ - /* boosting for this rcu_node structure. */ - unsigned int boost_kthread_status; - /* State of boost_kthread_task for tracing. */ - unsigned long n_tasks_boosted; - /* Total number of tasks boosted. */ - unsigned long n_exp_boosts; - /* Number of tasks boosted for expedited GP. */ - unsigned long n_normal_boosts; - /* Number of tasks boosted for normal GP. */ - unsigned long n_balk_blkd_tasks; - /* Refused to boost: no blocked tasks. */ - unsigned long n_balk_exp_gp_tasks; - /* Refused to boost: nothing blocking GP. */ - unsigned long n_balk_boost_tasks; - /* Refused to boost: already boosting. */ - unsigned long n_balk_notblocked; - /* Refused to boost: RCU RS CS still running. */ - unsigned long n_balk_notyet; - /* Refused to boost: not yet time. */ - unsigned long n_balk_nos; - /* Refused to boost: not sure why, though. */ - /* This can happen due to race conditions. */ -#endif /* #ifdef CONFIG_RCU_BOOST */ -#ifdef CONFIG_RCU_NOCB_CPU - wait_queue_head_t nocb_gp_wq[2]; - /* Place for rcu_nocb_kthread() to wait GP. */ -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - int need_future_gp[2]; - /* Counts of upcoming no-CB GP requests. */ - raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; -} ____cacheline_internodealigned_in_smp; - -/* - * Do a full breadth-first scan of the rcu_node structures for the - * specified rcu_state structure. - */ -#define rcu_for_each_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* - * Do a breadth-first scan of the non-leaf rcu_node structures for the - * specified rcu_state structure. Note that if there is a singleton - * rcu_node tree with but one rcu_node structure, this loop is a no-op. - */ -#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) - -/* - * Scan the leaves of the rcu_node hierarchy for the specified rcu_state - * structure. Note that if there is a singleton rcu_node tree with but - * one rcu_node structure, this loop -will- visit the rcu_node structure. - * It is still a leaf node, even if it is also the root node. - */ -#define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* Index values for nxttail array in struct rcu_data. */ -#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ -#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ -#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ -#define RCU_NEXT_TAIL 3 -#define RCU_NEXT_SIZE 4 - -/* Per-CPU data for read-copy update. */ -struct rcu_data { - /* 1) quiescent-state and grace-period handling : */ - unsigned long completed; /* Track rsp->completed gp number */ - /* in order to detect GP end. */ - unsigned long gpnum; /* Highest gp number that this CPU */ - /* is aware of having started. */ - bool passed_quiesce; /* User-mode/idle loop etc. */ - bool qs_pending; /* Core waits for quiesc state. */ - bool beenonline; /* CPU online at least once. */ - bool preemptible; /* Preemptible RCU? */ - struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ - unsigned long grpmask; /* Mask to apply to leaf qsmask. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO - unsigned long ticks_this_gp; /* The number of scheduling-clock */ - /* ticks this CPU has handled */ - /* during and after the last grace */ - /* period it is aware of. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - - /* 2) batch handling */ - /* - * If nxtlist is not NULL, it is partitioned as follows. - * Any of the partitions might be empty, in which case the - * pointer to that partition will be equal to the pointer for - * the following partition. When the list is empty, all of - * the nxttail elements point to the ->nxtlist pointer itself, - * which in that case is NULL. - * - * [nxtlist, *nxttail[RCU_DONE_TAIL]): - * Entries that batch # <= ->completed - * The grace period for these entries has completed, and - * the other grace-period-completed entries may be moved - * here temporarily in rcu_process_callbacks(). - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * Note that the value of *nxttail[RCU_NEXT_TAIL] will - * always be NULL, as this is the end of the list. - */ - struct rcu_head *nxtlist; - struct rcu_head **nxttail[RCU_NEXT_SIZE]; - unsigned long nxtcompleted[RCU_NEXT_SIZE]; - /* grace periods for sublists. */ - long qlen_lazy; /* # of lazy queued callbacks */ - long qlen; /* # of queued callbacks, incl lazy */ - long qlen_last_fqs_check; - /* qlen at last check for QS forcing */ - unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ - unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ - unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ - unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ - unsigned long n_force_qs_snap; - /* did other CPU force QS recently? */ - long blimit; /* Upper limit on a processed batch */ - - /* 3) dynticks interface. */ - struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ - int dynticks_snap; /* Per-GP tracking for dynticks. */ - - /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ - unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ - unsigned long offline_fqs; /* Kicked due to being offline. */ - - /* 5) __rcu_pending() statistics. */ - unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ - unsigned long n_rp_qs_pending; - unsigned long n_rp_report_qs; - unsigned long n_rp_cb_ready; - unsigned long n_rp_cpu_needs_gp; - unsigned long n_rp_gp_completed; - unsigned long n_rp_gp_started; - unsigned long n_rp_need_nothing; - - /* 6) _rcu_barrier() and OOM callbacks. */ - struct rcu_head barrier_head; -#ifdef CONFIG_RCU_FAST_NO_HZ - struct rcu_head oom_head; -#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - - /* 7) Callback offloading. */ -#ifdef CONFIG_RCU_NOCB_CPU - struct rcu_head *nocb_head; /* CBs waiting for kthread. */ - struct rcu_head **nocb_tail; - atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ - atomic_long_t nocb_q_count_lazy; /* (approximate). */ - int nocb_p_count; /* # CBs being invoked by kthread */ - int nocb_p_count_lazy; /* (approximate). */ - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ - struct task_struct *nocb_kthread; -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - - /* 8) RCU CPU stall data. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO - unsigned int softirq_snap; /* Snapshot of softirq activity. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - - int cpu; - struct rcu_state *rsp; -}; - -/* Values for fqs_state field in struct rcu_state. */ -#define RCU_GP_IDLE 0 /* No grace period in progress. */ -#define RCU_GP_INIT 1 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ -#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK - -#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) - /* For jiffies_till_first_fqs and */ - /* and jiffies_till_next_fqs. */ - -#define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */ - /* delay between bouts of */ - /* quiescent-state forcing. */ - -#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */ - /* at least one scheduling clock */ - /* irq before ratting on them. */ - -#define rcu_wait(cond) \ -do { \ - for (;;) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (cond) \ - break; \ - schedule(); \ - } \ - __set_current_state(TASK_RUNNING); \ -} while (0) - -/* - * RCU global state, including node hierarchy. This hierarchy is - * represented in "heap" form in a dense array. The root (first level) - * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second - * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), - * and the third level in ->node[m+1] and following (->node[m+1] referenced - * by ->level[2]). The number of levels is determined by the number of - * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy" - * consisting of a single rcu_node. - */ -struct rcu_state { - struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ - u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ - u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ - struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ - void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ - void (*func)(struct rcu_head *head)); - - /* The following fields are guarded by the root rcu_node's lock. */ - - u8 fqs_state ____cacheline_internodealigned_in_smp; - /* Force QS state. */ - u8 boost; /* Subject to priority boost. */ - unsigned long gpnum; /* Current gp number. */ - unsigned long completed; /* # of last completed gp. */ - struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ - int gp_flags; /* Commands for GP task. */ - - /* End of fields guarded by root rcu_node's lock. */ - - raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; - /* Protect following fields. */ - struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ - /* need a grace period. */ - struct rcu_head **orphan_nxttail; /* Tail of above. */ - struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ - /* are ready to invoke. */ - struct rcu_head **orphan_donetail; /* Tail of above. */ - long qlen_lazy; /* Number of lazy callbacks. */ - long qlen; /* Total number of callbacks. */ - /* End of fields guarded by orphan_lock. */ - - struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ - - struct mutex barrier_mutex; /* Guards barrier fields. */ - atomic_t barrier_cpu_count; /* # CPUs waiting on. */ - struct completion barrier_completion; /* Wake at barrier end. */ - unsigned long n_barrier_done; /* ++ at start and end of */ - /* _rcu_barrier(). */ - /* End of fields guarded by barrier_mutex. */ - - atomic_long_t expedited_start; /* Starting ticket. */ - atomic_long_t expedited_done; /* Done ticket. */ - atomic_long_t expedited_wrap; /* # near-wrap incidents. */ - atomic_long_t expedited_tryfail; /* # acquisition failures. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_normal; /* # fallbacks to normal. */ - atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ - atomic_long_t expedited_done_tries; /* # tries to update _done. */ - atomic_long_t expedited_done_lost; /* # times beaten to _done. */ - atomic_long_t expedited_done_exit; /* # times exited _done loop. */ - - unsigned long jiffies_force_qs; /* Time at which to invoke */ - /* force_quiescent_state(). */ - unsigned long n_force_qs; /* Number of calls to */ - /* force_quiescent_state(). */ - unsigned long n_force_qs_lh; /* ~Number of calls leaving */ - /* due to lock unavailable. */ - unsigned long n_force_qs_ngp; /* Number of calls leaving */ - /* due to no GP active. */ - unsigned long gp_start; /* Time at which GP started, */ - /* but in jiffies. */ - unsigned long jiffies_stall; /* Time at which to check */ - /* for CPU stalls. */ - unsigned long gp_max; /* Maximum GP duration in */ - /* jiffies. */ - const char *name; /* Name of structure. */ - char abbr; /* Abbreviated name. */ - struct list_head flavors; /* List of RCU flavors. */ - struct irq_work wakeup_work; /* Postponed wakeups */ -}; - -/* Values for rcu_state structure's gp_flags field. */ -#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ -#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ - -extern struct list_head rcu_struct_flavors; - -/* Sequence through rcu_state structures for each RCU flavor. */ -#define for_each_rcu_flavor(rsp) \ - list_for_each_entry((rsp), &rcu_struct_flavors, flavors) - -/* Return values for rcu_preempt_offline_tasks(). */ - -#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ - /* GP were moved to root. */ -#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ - /* GP were moved to root. */ - -/* - * RCU implementation internal declarations: - */ -extern struct rcu_state rcu_sched_state; -DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); - -extern struct rcu_state rcu_bh_state; -DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); - -#ifdef CONFIG_TREE_PREEMPT_RCU -extern struct rcu_state rcu_preempt_state; -DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -#ifdef CONFIG_RCU_BOOST -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DECLARE_PER_CPU(char, rcu_cpu_has_work); -#endif /* #ifdef CONFIG_RCU_BOOST */ - -#ifndef RCU_TREE_NONCORE - -/* Forward declarations for rcutree_plugin.h */ -static void rcu_bootup_announce(void); -long rcu_batches_completed(void); -static void rcu_preempt_note_context_switch(int cpu); -static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); -#ifdef CONFIG_HOTPLUG_CPU -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, - unsigned long flags); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_print_detail_task_stall(struct rcu_state *rsp); -static int rcu_print_task_stall(struct rcu_node *rnp); -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -#ifdef CONFIG_HOTPLUG_CPU -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_preempt_check_callbacks(int cpu); -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); -#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake); -#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ -static void __init __rcu_init_preempt(void); -static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); -static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static void invoke_rcu_callbacks_kthread(void); -static bool rcu_is_callbacks_kthread(void); -#ifdef CONFIG_RCU_BOOST -static void rcu_preempt_do_callbacks(void); -static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp); -#endif /* #ifdef CONFIG_RCU_BOOST */ -static void rcu_prepare_kthreads(int cpu); -static void rcu_cleanup_after_idle(int cpu); -static void rcu_prepare_for_idle(int cpu); -static void rcu_idle_count_callbacks_posted(void); -static void print_cpu_stall_info_begin(void); -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); -static void print_cpu_stall_info_end(void); -static void zero_cpu_stall_ticks(struct rcu_data *rdp); -static void increment_cpu_stall_ticks(void); -static int rcu_nocb_needs_gp(struct rcu_state *rsp); -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); -static void rcu_init_one_nocb(struct rcu_node *rnp); -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp); -static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); -static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); -static void rcu_kick_nohz_cpu(int cpu); -static bool init_nocb_callback_list(struct rcu_data *rdp); -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj); -static bool is_sysidle_rcu_state(struct rcu_state *rsp); -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj); -static void rcu_bind_gp_kthread(void); -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); - -#endif /* #ifndef RCU_TREE_NONCORE */ - -#ifdef CONFIG_RCU_TRACE -#ifdef CONFIG_RCU_NOCB_CPU -/* Sum up queue lengths for tracing. */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ - *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; - *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; -} -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) -{ - *ql = 0; - *qll = 0; -} -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ -#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h deleted file mode 100644 index 8d85a5ce093a..000000000000 --- a/kernel/rcutree_plugin.h +++ /dev/null @@ -1,2831 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion (tree-based version) - * Internal non-public definitions that provide either classic - * or preemptible semantics. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright Red Hat, 2009 - * Copyright IBM Corporation, 2009 - * - * Author: Ingo Molnar - * Paul E. McKenney - */ - -#include -#include -#include -#include -#include "time/tick-internal.h" - -#define RCU_KTHREAD_PRIO 1 - -#ifdef CONFIG_RCU_BOOST -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO -#else -#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO -#endif - -#ifdef CONFIG_RCU_NOCB_CPU -static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ -static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ -static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ -static char __initdata nocb_buf[NR_CPUS * 5]; -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - -/* - * Check the RCU kernel configuration parameters and print informative - * messages about anything out of the ordinary. If you like #ifdef, you - * will love this function. - */ -static void __init rcu_bootup_announce_oddness(void) -{ -#ifdef CONFIG_RCU_TRACE - pr_info("\tRCU debugfs-based tracing is enabled.\n"); -#endif -#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) - pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", - CONFIG_RCU_FANOUT); -#endif -#ifdef CONFIG_RCU_FANOUT_EXACT - pr_info("\tHierarchical RCU autobalancing is disabled.\n"); -#endif -#ifdef CONFIG_RCU_FAST_NO_HZ - pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); -#endif -#ifdef CONFIG_PROVE_RCU - pr_info("\tRCU lockdep checking is enabled.\n"); -#endif -#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE - pr_info("\tRCU torture testing starts during boot.\n"); -#endif -#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) - pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); -#endif -#if defined(CONFIG_RCU_CPU_STALL_INFO) - pr_info("\tAdditional per-CPU info printed with stalls.\n"); -#endif -#if NUM_RCU_LVL_4 != 0 - pr_info("\tFour-level hierarchy is enabled.\n"); -#endif - if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) - pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); - if (nr_cpu_ids != NR_CPUS) - pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); -#ifdef CONFIG_RCU_NOCB_CPU -#ifndef CONFIG_RCU_NOCB_CPU_NONE - if (!have_rcu_nocb_mask) { - zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); - have_rcu_nocb_mask = true; - } -#ifdef CONFIG_RCU_NOCB_CPU_ZERO - pr_info("\tOffload RCU callbacks from CPU 0\n"); - cpumask_set_cpu(0, rcu_nocb_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ -#ifdef CONFIG_RCU_NOCB_CPU_ALL - pr_info("\tOffload RCU callbacks from all CPUs\n"); - cpumask_copy(rcu_nocb_mask, cpu_possible_mask); -#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ -#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ - if (have_rcu_nocb_mask) { - if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { - pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); - cpumask_and(rcu_nocb_mask, cpu_possible_mask, - rcu_nocb_mask); - } - cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); - pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); - if (rcu_nocb_poll) - pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); - } -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ -} - -#ifdef CONFIG_TREE_PREEMPT_RCU - -RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -static struct rcu_state *rcu_state = &rcu_preempt_state; - -static int rcu_preempted_readers_exp(struct rcu_node *rnp); - -/* - * Tell them what RCU they are running. - */ -static void __init rcu_bootup_announce(void) -{ - pr_info("Preemptible hierarchical RCU implementation.\n"); - rcu_bootup_announce_oddness(); -} - -/* - * Return the number of RCU-preempt batches processed thus far - * for debug and statistics. - */ -long rcu_batches_completed_preempt(void) -{ - return rcu_preempt_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); - -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -long rcu_batches_completed(void) -{ - return rcu_batches_completed_preempt(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* - * Force a quiescent state for preemptible RCU. - */ -void rcu_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_preempt_state); -} -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); - -/* - * Record a preemptible-RCU quiescent state for the specified CPU. Note - * that this just means that the task currently running on the CPU is - * not in a quiescent state. There might be any number of tasks blocked - * while in an RCU read-side critical section. - * - * Unlike the other rcu_*_qs() functions, callers to this function - * must disable irqs in order to protect the assignment to - * ->rcu_read_unlock_special. - */ -static void rcu_preempt_qs(int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - - if (rdp->passed_quiesce == 0) - trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); - rdp->passed_quiesce = 1; - current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; -} - -/* - * We have entered the scheduler, and the current task might soon be - * context-switched away from. If this task is in an RCU read-side - * critical section, we will no longer be able to rely on the CPU to - * record that fact, so we enqueue the task on the blkd_tasks list. - * The task will dequeue itself when it exits the outermost enclosing - * RCU read-side critical section. Therefore, the current grace period - * cannot be permitted to complete until the blkd_tasks list entries - * predating the current grace period drain, in other words, until - * rnp->gp_tasks becomes NULL. - * - * Caller must disable preemption. - */ -static void rcu_preempt_note_context_switch(int cpu) -{ - struct task_struct *t = current; - unsigned long flags; - struct rcu_data *rdp; - struct rcu_node *rnp; - - if (t->rcu_read_lock_nesting > 0 && - (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { - - /* Possibly blocking in an RCU read-side critical section. */ - rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); - rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; - t->rcu_blocked_node = rnp; - - /* - * If this CPU has already checked in, then this task - * will hold up the next grace period rather than the - * current grace period. Queue the task accordingly. - * If the task is queued for the current grace period - * (i.e., this CPU has not yet passed through a quiescent - * state for the current grace period), then as long - * as that task remains queued, the current grace period - * cannot end. Note that there is some uncertainty as - * to exactly when the current grace period started. - * We take a conservative approach, which can result - * in unnecessarily waiting on tasks that started very - * slightly after the current grace period began. C'est - * la vie!!! - * - * But first, note that the current CPU must still be - * on line! - */ - WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); - WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { - list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); - rnp->gp_tasks = &t->rcu_node_entry; -#ifdef CONFIG_RCU_BOOST - if (rnp->boost_tasks != NULL) - rnp->boost_tasks = rnp->gp_tasks; -#endif /* #ifdef CONFIG_RCU_BOOST */ - } else { - list_add(&t->rcu_node_entry, &rnp->blkd_tasks); - if (rnp->qsmask & rdp->grpmask) - rnp->gp_tasks = &t->rcu_node_entry; - } - trace_rcu_preempt_task(rdp->rsp->name, - t->pid, - (rnp->qsmask & rdp->grpmask) - ? rnp->gpnum - : rnp->gpnum + 1); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } else if (t->rcu_read_lock_nesting < 0 && - t->rcu_read_unlock_special) { - - /* - * Complete exit from RCU read-side critical section on - * behalf of preempted instance of __rcu_read_unlock(). - */ - rcu_read_unlock_special(t); - } - - /* - * Either we were not in an RCU read-side critical section to - * begin with, or we have now recorded that critical section - * globally. Either way, we can now note a quiescent state - * for this CPU. Again, if we were in an RCU read-side critical - * section, and if that critical section was blocking the current - * grace period, then the fact that the task has been enqueued - * means that we continue to block the current grace period. - */ - local_irq_save(flags); - rcu_preempt_qs(cpu); - local_irq_restore(flags); -} - -/* - * Check for preempted RCU readers blocking the current grace period - * for the specified rcu_node structure. If the caller needs a reliable - * answer, it must hold the rcu_node's ->lock. - */ -static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) -{ - return rnp->gp_tasks != NULL; -} - -/* - * Record a quiescent state for all tasks that were previously queued - * on the specified rcu_node structure and that were blocking the current - * RCU grace period. The caller must hold the specified rnp->lock with - * irqs disabled, and this lock is released upon return, but irqs remain - * disabled. - */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - struct rcu_node *rnp_p; - - if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; /* Still need more quiescent states! */ - } - - rnp_p = rnp->parent; - if (rnp_p == NULL) { - /* - * Either there is only one rcu_node in the tree, - * or tasks were kicked up to root rcu_node due to - * CPUs going offline. - */ - rcu_report_qs_rsp(&rcu_preempt_state, flags); - return; - } - - /* Report up the rest of the hierarchy. */ - mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); -} - -/* - * Advance a ->blkd_tasks-list pointer to the next entry, instead - * returning NULL if at the end of the list. - */ -static struct list_head *rcu_next_node_entry(struct task_struct *t, - struct rcu_node *rnp) -{ - struct list_head *np; - - np = t->rcu_node_entry.next; - if (np == &rnp->blkd_tasks) - np = NULL; - return np; -} - -/* - * Handle special cases during rcu_read_unlock(), such as needing to - * notify RCU core processing or task having blocked during the RCU - * read-side critical section. - */ -void rcu_read_unlock_special(struct task_struct *t) -{ - int empty; - int empty_exp; - int empty_exp_now; - unsigned long flags; - struct list_head *np; -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rbmp = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ - struct rcu_node *rnp; - int special; - - /* NMI handlers cannot block and cannot safely manipulate state. */ - if (in_nmi()) - return; - - local_irq_save(flags); - - /* - * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. - */ - special = t->rcu_read_unlock_special; - if (special & RCU_READ_UNLOCK_NEED_QS) { - rcu_preempt_qs(smp_processor_id()); - } - - /* Hardware IRQ handlers cannot block. */ - if (in_irq() || in_serving_softirq()) { - local_irq_restore(flags); - return; - } - - /* Clean up if blocked during RCU read-side critical section. */ - if (special & RCU_READ_UNLOCK_BLOCKED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; - - /* - * Remove this task from the list it blocked on. The - * task can migrate while we acquire the lock, but at - * most one time. So at most two passes through loop. - */ - for (;;) { - rnp = t->rcu_blocked_node; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - if (rnp == t->rcu_blocked_node) - break; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } - empty = !rcu_preempt_blocked_readers_cgp(rnp); - empty_exp = !rcu_preempted_readers_exp(rnp); - smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ - np = rcu_next_node_entry(t, rnp); - list_del_init(&t->rcu_node_entry); - t->rcu_blocked_node = NULL; - trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), - rnp->gpnum, t->pid); - if (&t->rcu_node_entry == rnp->gp_tasks) - rnp->gp_tasks = np; - if (&t->rcu_node_entry == rnp->exp_tasks) - rnp->exp_tasks = np; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; - /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ - if (t->rcu_boost_mutex) { - rbmp = t->rcu_boost_mutex; - t->rcu_boost_mutex = NULL; - } -#endif /* #ifdef CONFIG_RCU_BOOST */ - - /* - * If this was the last task on the current list, and if - * we aren't waiting on any CPUs, report the quiescent state. - * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, - * so we must take a snapshot of the expedited state. - */ - empty_exp_now = !rcu_preempted_readers_exp(rnp); - if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { - trace_rcu_quiescent_state_report(TPS("preempt_rcu"), - rnp->gpnum, - 0, rnp->qsmask, - rnp->level, - rnp->grplo, - rnp->grphi, - !!rnp->gp_tasks); - rcu_report_unblock_qs_rnp(rnp, flags); - } else { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } - -#ifdef CONFIG_RCU_BOOST - /* Unboost if we were boosted. */ - if (rbmp) - rt_mutex_unlock(rbmp); -#endif /* #ifdef CONFIG_RCU_BOOST */ - - /* - * If this was the last task on the expedited lists, - * then we need to report up the rcu_node hierarchy. - */ - if (!empty_exp && empty_exp_now) - rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); - } else { - local_irq_restore(flags); - } -} - -#ifdef CONFIG_RCU_CPU_STALL_VERBOSE - -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period on the specified rcu_node structure. - */ -static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) -{ - unsigned long flags; - struct task_struct *t; - - raw_spin_lock_irqsave(&rnp->lock, flags); - if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - t = list_entry(rnp->gp_tasks, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) - sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period. - */ -static void rcu_print_detail_task_stall(struct rcu_state *rsp) -{ - struct rcu_node *rnp = rcu_get_root(rsp); - - rcu_print_detail_task_stall_rnp(rnp); - rcu_for_each_leaf_node(rsp, rnp) - rcu_print_detail_task_stall_rnp(rnp); -} - -#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ - -static void rcu_print_detail_task_stall(struct rcu_state *rsp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ - -#ifdef CONFIG_RCU_CPU_STALL_INFO - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ - pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", - rnp->level, rnp->grplo, rnp->grphi); -} - -static void rcu_print_task_stall_end(void) -{ - pr_cont("\n"); -} - -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ -} - -static void rcu_print_task_stall_end(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ - struct task_struct *t; - int ndetected = 0; - - if (!rcu_preempt_blocked_readers_cgp(rnp)) - return 0; - rcu_print_task_stall_begin(rnp); - t = list_entry(rnp->gp_tasks, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); - ndetected++; - } - rcu_print_task_stall_end(); - return ndetected; -} - -/* - * Check that the list of blocked tasks for the newly completed grace - * period is in fact empty. It is a serious bug to complete a grace - * period that still has RCU readers blocked! This function must be - * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock - * must be held by the caller. - * - * Also, if there are blocked tasks on the list, they automatically - * block the newly created grace period, so set up ->gp_tasks accordingly. - */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) -{ - WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (!list_empty(&rnp->blkd_tasks)) - rnp->gp_tasks = rnp->blkd_tasks.next; - WARN_ON_ONCE(rnp->qsmask); -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Handle tasklist migration for case in which all CPUs covered by the - * specified rcu_node have gone offline. Move them up to the root - * rcu_node. The reason for not just moving them to the immediate - * parent is to remove the need for rcu_read_unlock_special() to - * make more than two attempts to acquire the target rcu_node's lock. - * Returns true if there were tasks blocking the current RCU grace - * period. - * - * Returns 1 if there was previously a task blocking the current grace - * period on the specified rcu_node structure. - * - * The caller must hold rnp->lock with irqs disabled. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - struct list_head *lp; - struct list_head *lp_root; - int retval = 0; - struct rcu_node *rnp_root = rcu_get_root(rsp); - struct task_struct *t; - - if (rnp == rnp_root) { - WARN_ONCE(1, "Last CPU thought to be offlined?"); - return 0; /* Shouldn't happen: at least one CPU online. */ - } - - /* If we are on an internal node, complain bitterly. */ - WARN_ON_ONCE(rnp != rdp->mynode); - - /* - * Move tasks up to root rcu_node. Don't try to get fancy for - * this corner-case operation -- just put this node's tasks - * at the head of the root node's list, and update the root node's - * ->gp_tasks and ->exp_tasks pointers to those of this node's, - * if non-NULL. This might result in waiting for more tasks than - * absolutely necessary, but this is a good performance/complexity - * tradeoff. - */ - if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) - retval |= RCU_OFL_TASKS_NORM_GP; - if (rcu_preempted_readers_exp(rnp)) - retval |= RCU_OFL_TASKS_EXP_GP; - lp = &rnp->blkd_tasks; - lp_root = &rnp_root->blkd_tasks; - while (!list_empty(lp)) { - t = list_entry(lp->next, typeof(*t), rcu_node_entry); - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - list_del(&t->rcu_node_entry); - t->rcu_blocked_node = rnp_root; - list_add(&t->rcu_node_entry, lp_root); - if (&t->rcu_node_entry == rnp->gp_tasks) - rnp_root->gp_tasks = rnp->gp_tasks; - if (&t->rcu_node_entry == rnp->exp_tasks) - rnp_root->exp_tasks = rnp->exp_tasks; -#ifdef CONFIG_RCU_BOOST - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp_root->boost_tasks = rnp->boost_tasks; -#endif /* #ifdef CONFIG_RCU_BOOST */ - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ - } - - rnp->gp_tasks = NULL; - rnp->exp_tasks = NULL; -#ifdef CONFIG_RCU_BOOST - rnp->boost_tasks = NULL; - /* - * In case root is being boosted and leaf was not. Make sure - * that we boost the tasks blocking the current grace period - * in this case. - */ - raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ - if (rnp_root->boost_tasks != NULL && - rnp_root->boost_tasks != rnp_root->gp_tasks && - rnp_root->boost_tasks != rnp_root->exp_tasks) - rnp_root->boost_tasks = rnp_root->gp_tasks; - raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ -#endif /* #ifdef CONFIG_RCU_BOOST */ - - return retval; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -/* - * Check for a quiescent state from the current CPU. When a task blocks, - * the task is recorded in the corresponding CPU's rcu_node structure, - * which is checked elsewhere. - * - * Caller must disable hard irqs. - */ -static void rcu_preempt_check_callbacks(int cpu) -{ - struct task_struct *t = current; - - if (t->rcu_read_lock_nesting == 0) { - rcu_preempt_qs(cpu); - return; - } - if (t->rcu_read_lock_nesting > 0 && - per_cpu(rcu_preempt_data, cpu).qs_pending) - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; -} - -#ifdef CONFIG_RCU_BOOST - -static void rcu_preempt_do_callbacks(void) -{ - rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Queue a preemptible-RCU callback for invocation after a grace period. - */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_preempt_state, -1, 0); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). - */ -void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_preempt_state, -1, 1); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu); - -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. Note, however, that - * upon return from synchronize_rcu(), the caller might well be executing - * concurrently with new RCU read-side critical sections that began while - * synchronize_rcu() was waiting. RCU read-side critical sections are - * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. - * - * See the description of synchronize_sched() for more detailed information - * on memory ordering guarantees. - */ -void synchronize_rcu(void) -{ - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - if (!rcu_scheduler_active) - return; - if (rcu_expedited) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; -static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); - -/* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. - */ -static int rcu_preempted_readers_exp(struct rcu_node *rnp) -{ - return rnp->exp_tasks != NULL; -} - -/* - * return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold sync_rcu_preempt_exp_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return !rcu_preempted_readers_exp(rnp) && - ACCESS_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Most callers will set the "wake" flag, but the task initiating the - * expedited grace period need not wake itself. - * - * Caller must hold sync_rcu_preempt_exp_mutex. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ - unsigned long flags; - unsigned long mask; - - raw_spin_lock_irqsave(&rnp->lock, flags); - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (wake) - wake_up(&sync_rcu_preempt_exp_wq); - break; - } - mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - rnp->expmask &= ~mask; - } -} - -/* - * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure. If there are no such - * tasks, report it up the rcu_node hierarchy. - * - * Caller must hold sync_rcu_preempt_exp_mutex and must exclude - * CPU hotplug operations. - */ -static void -sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) -{ - unsigned long flags; - int must_wait = 0; - - raw_spin_lock_irqsave(&rnp->lock, flags); - if (list_empty(&rnp->blkd_tasks)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } else { - rnp->exp_tasks = rnp->blkd_tasks.next; - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - must_wait = 1; - } - if (!must_wait) - rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ -} - -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. - * In fact, if you are using synchronize_rcu_expedited() in a loop, - * please restructure your code to batch your updates, and then Use a - * single synchronize_rcu() instead. - * - * Note that it is illegal to call this function while holding any lock - * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal - * to call this function from a CPU-hotplug notifier. Failing to observe - * these restriction will result in deadlock. - */ -void synchronize_rcu_expedited(void) -{ - unsigned long flags; - struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_preempt_state; - unsigned long snap; - int trycount = 0; - - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; - smp_mb(); /* Above access cannot bleed into critical section. */ - - /* - * Block CPU-hotplug operations. This means that any CPU-hotplug - * operation that finds an rcu_node structure with tasks in the - * process of being boosted will know that all tasks blocking - * this expedited grace period will already be in the process of - * being boosted. This simplifies the process of moving tasks - * from leaf to root rcu_node structures. - */ - get_online_cpus(); - - /* - * Acquire lock, falling back to synchronize_rcu() if too many - * lock-acquisition failures. Of course, if someone does the - * expedited grace period for us, just leave. - */ - while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (ULONG_CMP_LT(snap, - ACCESS_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); - goto mb_ret; /* Others did our work for us. */ - } - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - put_online_cpus(); - wait_rcu_gp(call_rcu); - return; - } - } - if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); - goto unlock_mb_ret; /* Others did our work for us. */ - } - - /* force all RCU readers onto ->blkd_tasks lists. */ - synchronize_sched_expedited(); - - /* Initialize ->expmask for all non-leaf rcu_node structures. */ - rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->expmask = rnp->qsmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } - - /* Snapshot current state of ->blkd_tasks lists. */ - rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init(rsp, rnp); - if (NUM_RCU_NODES > 1) - sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); - - put_online_cpus(); - - /* Wait for snapshotted ->blkd_tasks lists to drain. */ - rnp = rcu_get_root(rsp); - wait_event(sync_rcu_preempt_exp_wq, - sync_rcu_preempt_exp_done(rnp)); - - /* Clean up and exit. */ - smp_mb(); /* ensure expedited GP seen before counter increment. */ - ACCESS_ONCE(sync_rcu_preempt_exp_count)++; -unlock_mb_ret: - mutex_unlock(&sync_rcu_preempt_exp_mutex); -mb_ret: - smp_mb(); /* ensure subsequent action seen after grace period. */ -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -/** - * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. - * - * Note that this primitive does not necessarily wait for an RCU grace period - * to complete. For example, if there are no RCU callbacks queued anywhere - * in the system, then rcu_barrier() is within its rights to return - * immediately, without waiting for anything, much less an RCU grace period. - */ -void rcu_barrier(void) -{ - _rcu_barrier(&rcu_preempt_state); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - -/* - * Initialize preemptible RCU's state structures. - */ -static void __init __rcu_init_preempt(void) -{ - rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); -} - -/* - * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. - */ -void exit_rcu(void) -{ - struct task_struct *t = current; - - if (likely(list_empty(¤t->rcu_node_entry))) - return; - t->rcu_read_lock_nesting = 1; - barrier(); - t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; - __rcu_read_unlock(); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - -static struct rcu_state *rcu_state = &rcu_sched_state; - -/* - * Tell them what RCU they are running. - */ -static void __init rcu_bootup_announce(void) -{ - pr_info("Hierarchical RCU implementation.\n"); - rcu_bootup_announce_oddness(); -} - -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - -/* - * Force a quiescent state for RCU, which, because there is no preemptible - * RCU, becomes the same as rcu-sched. - */ -void rcu_force_quiescent_state(void) -{ - rcu_sched_force_quiescent_state(); -} -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); - -/* - * Because preemptible RCU does not exist, we never have to check for - * CPUs being in quiescent states. - */ -static void rcu_preempt_note_context_switch(int cpu) -{ -} - -/* - * Because preemptible RCU does not exist, there are never any preempted - * RCU readers. - */ -static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) -{ - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* Because preemptible RCU does not exist, no quieting of tasks. */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) -{ - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static void rcu_print_detail_task_stall(struct rcu_state *rsp) -{ -} - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ - return 0; -} - -/* - * Because there is no preemptible RCU, there can be no readers blocked, - * so there is no need to check for blocked tasks. So check only for - * bogus qsmask values. - */ -static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) -{ - WARN_ON_ONCE(rnp->qsmask); -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections, and - * such non-existent tasks cannot possibly have been blocking the current - * grace period. - */ -static int rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) -{ - return 0; -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -/* - * Because preemptible RCU does not exist, it never has any callbacks - * to check. - */ -static void rcu_preempt_check_callbacks(int cpu) -{ -} - -/* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). - * - * Because there is no preemptible RCU, we use RCU-sched instead. - */ -void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) -{ - __call_rcu(head, func, &rcu_sched_state, -1, 1); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu); - -/* - * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptible RCU does not exist, map to rcu-sched. - */ -void synchronize_rcu_expedited(void) -{ - synchronize_sched_expedited(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Because preemptible RCU does not exist, there is never any need to - * report on tasks preempted in RCU read-side critical sections during - * expedited RCU grace periods. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ -} - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - -/* - * Because preemptible RCU does not exist, rcu_barrier() is just - * another name for rcu_barrier_sched(). - */ -void rcu_barrier(void) -{ - rcu_barrier_sched(); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - -/* - * Because preemptible RCU does not exist, it need not be initialized. - */ -static void __init __rcu_init_preempt(void) -{ -} - -/* - * Because preemptible RCU does not exist, tasks cannot possibly exit - * while in preemptible RCU read-side critical sections. - */ -void exit_rcu(void) -{ -} - -#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ - -#ifdef CONFIG_RCU_BOOST - -#include "rtmutex_common.h" - -#ifdef CONFIG_RCU_TRACE - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ - if (list_empty(&rnp->blkd_tasks)) - rnp->n_balk_blkd_tasks++; - else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) - rnp->n_balk_exp_gp_tasks++; - else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) - rnp->n_balk_boost_tasks++; - else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) - rnp->n_balk_notblocked++; - else if (rnp->gp_tasks != NULL && - ULONG_CMP_LT(jiffies, rnp->boost_time)) - rnp->n_balk_notyet++; - else - rnp->n_balk_nos++; -} - -#else /* #ifdef CONFIG_RCU_TRACE */ - -static void rcu_initiate_boost_trace(struct rcu_node *rnp) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - -static void rcu_wake_cond(struct task_struct *t, int status) -{ - /* - * If the thread is yielding, only wake it when this - * is invoked from idle - */ - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) - wake_up_process(t); -} - -/* - * Carry out RCU priority boosting on the task indicated by ->exp_tasks - * or ->boost_tasks, advancing the pointer to the next task in the - * ->blkd_tasks list. - * - * Note that irqs must be enabled: boosting the task can block. - * Returns 1 if there are more tasks needing to be boosted. - */ -static int rcu_boost(struct rcu_node *rnp) -{ - unsigned long flags; - struct rt_mutex mtx; - struct task_struct *t; - struct list_head *tb; - - if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) - return 0; /* Nothing left to boost. */ - - raw_spin_lock_irqsave(&rnp->lock, flags); - - /* - * Recheck under the lock: all tasks in need of boosting - * might exit their RCU read-side critical sections on their own. - */ - if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return 0; - } - - /* - * Preferentially boost tasks blocking expedited grace periods. - * This cannot starve the normal grace periods because a second - * expedited grace period must boost all blocked tasks, including - * those blocking the pre-existing normal grace period. - */ - if (rnp->exp_tasks != NULL) { - tb = rnp->exp_tasks; - rnp->n_exp_boosts++; - } else { - tb = rnp->boost_tasks; - rnp->n_normal_boosts++; - } - rnp->n_tasks_boosted++; - - /* - * We boost task t by manufacturing an rt_mutex that appears to - * be held by task t. We leave a pointer to that rt_mutex where - * task t can find it, and task t will release the mutex when it - * exits its outermost RCU read-side critical section. Then - * simply acquiring this artificial rt_mutex will boost task - * t's priority. (Thanks to tglx for suggesting this approach!) - * - * Note that task t must acquire rnp->lock to remove itself from - * the ->blkd_tasks list, which it will do from exit() if from - * nowhere else. We therefore are guaranteed that task t will - * stay around at least until we drop rnp->lock. Note that - * rnp->lock also resolves races between our priority boosting - * and task t's exiting its outermost RCU read-side critical - * section. - */ - t = container_of(tb, struct task_struct, rcu_node_entry); - rt_mutex_init_proxy_locked(&mtx, t); - t->rcu_boost_mutex = &mtx; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ - rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ - - return ACCESS_ONCE(rnp->exp_tasks) != NULL || - ACCESS_ONCE(rnp->boost_tasks) != NULL; -} - -/* - * Priority-boosting kthread. One per leaf rcu_node and one for the - * root rcu_node. - */ -static int rcu_boost_kthread(void *arg) -{ - struct rcu_node *rnp = (struct rcu_node *)arg; - int spincnt = 0; - int more2boost; - - trace_rcu_utilization(TPS("Start boost kthread@init")); - for (;;) { - rnp->boost_kthread_status = RCU_KTHREAD_WAITING; - trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || rnp->exp_tasks); - trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); - rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; - more2boost = rcu_boost(rnp); - if (more2boost) - spincnt++; - else - spincnt = 0; - if (spincnt > 10) { - rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; - trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); - schedule_timeout_interruptible(2); - trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); - spincnt = 0; - } - } - /* NOTREACHED */ - trace_rcu_utilization(TPS("End boost kthread@notreached")); - return 0; -} - -/* - * Check to see if it is time to start boosting RCU readers that are - * blocking the current grace period, and, if so, tell the per-rcu_node - * kthread to start boosting them. If there is an expedited grace - * period in progress, it is always time to boost. - * - * The caller must hold rnp->lock, which this function releases. - * The ->boost_kthread_task is immortal, so we don't need to worry - * about it going away. - */ -static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) -{ - struct task_struct *t; - - if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { - rnp->n_balk_exp_gp_tasks++; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - if (rnp->exp_tasks != NULL || - (rnp->gp_tasks != NULL && - rnp->boost_tasks == NULL && - rnp->qsmask == 0 && - ULONG_CMP_GE(jiffies, rnp->boost_time))) { - if (rnp->exp_tasks == NULL) - rnp->boost_tasks = rnp->gp_tasks; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - t = rnp->boost_kthread_task; - if (t) - rcu_wake_cond(t, rnp->boost_kthread_status); - } else { - rcu_initiate_boost_trace(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } -} - -/* - * Wake up the per-CPU kthread to invoke RCU callbacks. - */ -static void invoke_rcu_callbacks_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) { - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), - __this_cpu_read(rcu_cpu_kthread_status)); - } - local_irq_restore(flags); -} - -/* - * Is the current CPU running the RCU-callbacks kthread? - * Caller must have preemption disabled. - */ -static bool rcu_is_callbacks_kthread(void) -{ - return __this_cpu_read(rcu_cpu_kthread_task) == current; -} - -#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) - -/* - * Do priority-boost accounting for the start of a new grace period. - */ -static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) -{ - rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; -} - -/* - * Create an RCU-boost kthread for the specified node if one does not - * already exist. We only create this kthread for preemptible RCU. - * Returns zero if all is well, a negated errno otherwise. - */ -static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) -{ - int rnp_index = rnp - &rsp->node[0]; - unsigned long flags; - struct sched_param sp; - struct task_struct *t; - - if (&rcu_preempt_state != rsp) - return 0; - - if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) - return 0; - - rsp->boost = 1; - if (rnp->boost_kthread_task != NULL) - return 0; - t = kthread_create(rcu_boost_kthread, (void *)rnp, - "rcub/%d", rnp_index); - if (IS_ERR(t)) - return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->boost_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = RCU_BOOST_PRIO; - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); - wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - return 0; -} - -static void rcu_kthread_do_work(void) -{ - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); - rcu_preempt_do_callbacks(); -} - -static void rcu_cpu_kthread_setup(unsigned int cpu) -{ - struct sched_param sp; - - sp.sched_priority = RCU_KTHREAD_PRIO; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -} - -static void rcu_cpu_kthread_park(unsigned int cpu) -{ - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; -} - -static int rcu_cpu_kthread_should_run(unsigned int cpu) -{ - return __this_cpu_read(rcu_cpu_has_work); -} - -/* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the - * RCU softirq used in flavors and configurations of RCU that do not - * support RCU priority boosting. - */ -static void rcu_cpu_kthread(unsigned int cpu) -{ - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); - int spincnt; - - for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); - local_bh_disable(); - *statusp = RCU_KTHREAD_RUNNING; - this_cpu_inc(rcu_cpu_kthread_loops); - local_irq_disable(); - work = *workp; - *workp = 0; - local_irq_enable(); - if (work) - rcu_kthread_do_work(); - local_bh_enable(); - if (*workp == 0) { - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); - *statusp = RCU_KTHREAD_WAITING; - return; - } - } - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); - *statusp = RCU_KTHREAD_WAITING; -} - -/* - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - */ -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ - struct task_struct *t = rnp->boost_kthread_task; - unsigned long mask = rnp->qsmaskinit; - cpumask_var_t cm; - int cpu; - - if (!t) - return; - if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) - return; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) - if ((mask & 0x1) && cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - if (cpumask_weight(cm) == 0) { - cpumask_setall(cm); - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) - cpumask_clear_cpu(cpu, cm); - WARN_ON_ONCE(cpumask_weight(cm) == 0); - } - set_cpus_allowed_ptr(t, cm); - free_cpumask_var(cm); -} - -static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_cpu_kthread_task, - .thread_should_run = rcu_cpu_kthread_should_run, - .thread_fn = rcu_cpu_kthread, - .thread_comm = "rcuc/%u", - .setup = rcu_cpu_kthread_setup, - .park = rcu_cpu_kthread_park, -}; - -/* - * Spawn all kthreads -- called as soon as the scheduler is running. - */ -static int __init rcu_spawn_kthreads(void) -{ - struct rcu_node *rnp; - int cpu; - - rcu_scheduler_fully_active = 1; - for_each_possible_cpu(cpu) - per_cpu(rcu_cpu_has_work, cpu) = 0; - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); - rnp = rcu_get_root(rcu_state); - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); - if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state, rnp) - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); - } - return 0; -} -early_initcall(rcu_spawn_kthreads); - -static void rcu_prepare_kthreads(int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); - struct rcu_node *rnp = rdp->mynode; - - /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ - if (rcu_scheduler_fully_active) - (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); -} - -#else /* #ifdef CONFIG_RCU_BOOST */ - -static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) -{ - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -static void invoke_rcu_callbacks_kthread(void) -{ - WARN_ON_ONCE(1); -} - -static bool rcu_is_callbacks_kthread(void) -{ - return false; -} - -static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) -{ -} - -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) -{ -} - -static int __init rcu_scheduler_really_started(void) -{ - rcu_scheduler_fully_active = 1; - return 0; -} -early_initcall(rcu_scheduler_really_started); - -static void rcu_prepare_kthreads(int cpu) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - -#if !defined(CONFIG_RCU_FAST_NO_HZ) - -/* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. - * - * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs - * any flavor of RCU. - */ -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) -{ - *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(cpu, NULL); -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up - * after it. - */ -static void rcu_cleanup_after_idle(int cpu) -{ -} - -/* - * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, - * is nothing. - */ -static void rcu_prepare_for_idle(int cpu) -{ -} - -/* - * Don't bother keeping a running count of the number of RCU callbacks - * posted because CONFIG_RCU_FAST_NO_HZ=n. - */ -static void rcu_idle_count_callbacks_posted(void) -{ -} - -#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - -/* - * This code is invoked when a CPU goes idle, at which point we want - * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. This is handled by a - * state machine implemented by rcu_prepare_for_idle() below. - * - * The following three proprocessor symbols control this state machine: - * - * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted - * to sleep in dyntick-idle mode with RCU callbacks pending. This - * is sized to be roughly one RCU grace period. Those energy-efficiency - * benchmarkers who might otherwise be tempted to set this to a large - * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your - * system. And if you are -that- concerned about energy efficiency, - * just power the system down and be done with it! - * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is - * permitted to sleep in dyntick-idle mode with only lazy RCU - * callbacks pending. Setting this too high can OOM your system. - * - * The values below work well in practice. If future workloads require - * adjustment, they can be converted into kernel config parameters, though - * making the state machine smarter might be a better option. - */ -#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ -#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ - -static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; -module_param(rcu_idle_gp_delay, int, 0644); -static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; -module_param(rcu_idle_lazy_gp_delay, int, 0644); - -extern int tick_nohz_enabled; - -/* - * Try to advance callbacks for all flavors of RCU on the current CPU, but - * only if it has been awhile since the last time we did so. Afterwards, - * if there are any callbacks ready for immediate invocation, return true. - */ -static bool rcu_try_advance_all_cbs(void) -{ - bool cbs_ready = false; - struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - struct rcu_node *rnp; - struct rcu_state *rsp; - - /* Exit early if we advanced recently. */ - if (jiffies == rdtp->last_advance_all) - return 0; - rdtp->last_advance_all = jiffies; - - for_each_rcu_flavor(rsp) { - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - - /* - * Don't bother checking unless a grace period has - * completed since we last checked and there are - * callbacks not yet ready to invoke. - */ - if (rdp->completed != rnp->completed && - rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) - note_gp_changes(rsp, rdp); - - if (cpu_has_callbacks_ready_to_invoke(rdp)) - cbs_ready = true; - } - return cbs_ready; -} - -/* - * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready - * to invoke. If the CPU has callbacks, try to advance them. Tell the - * caller to set the timeout based on whether or not there are non-lazy - * callbacks. - * - * The caller must have disabled interrupts. - */ -int rcu_needs_cpu(int cpu, unsigned long *dj) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - /* Snapshot to detect later posting of non-lazy callback. */ - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; - - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { - *dj = ULONG_MAX; - return 0; - } - - /* Attempt to advance callbacks. */ - if (rcu_try_advance_all_cbs()) { - /* Some ready to invoke, so initiate later invocation. */ - invoke_rcu_core(); - return 1; - } - rdtp->last_accelerate = jiffies; - - /* Request timer delay depending on laziness, and round. */ - if (!rdtp->all_lazy) { - *dj = round_up(rcu_idle_gp_delay + jiffies, - rcu_idle_gp_delay) - jiffies; - } else { - *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; - } - return 0; -} - -/* - * Prepare a CPU for idle from an RCU perspective. The first major task - * is to sense whether nohz mode has been enabled or disabled via sysfs. - * The second major task is to check to see if a non-lazy callback has - * arrived at a CPU that previously had only lazy callbacks. The third - * major task is to accelerate (that is, assign grace-period numbers to) - * any recently arrived callbacks. - * - * The caller must have disabled interrupts. - */ -static void rcu_prepare_for_idle(int cpu) -{ - struct rcu_data *rdp; - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - struct rcu_node *rnp; - struct rcu_state *rsp; - int tne; - - /* Handle nohz enablement switches conservatively. */ - tne = ACCESS_ONCE(tick_nohz_enabled); - if (tne != rdtp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(cpu, NULL)) - invoke_rcu_core(); /* force nohz to see update. */ - rdtp->tick_nohz_enabled_snap = tne; - return; - } - if (!tne) - return; - - /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(cpu)) - return; - - /* - * If a non-lazy callback arrived at a CPU having only lazy - * callbacks, invoke RCU core for the side-effect of recalculating - * idle duration on re-entry to idle. - */ - if (rdtp->all_lazy && - rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { - rdtp->all_lazy = false; - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; - invoke_rcu_core(); - return; - } - - /* - * If we have not yet accelerated this jiffy, accelerate all - * callbacks on this CPU. - */ - if (rdtp->last_accelerate == jiffies) - return; - rdtp->last_accelerate = jiffies; - for_each_rcu_flavor(rsp) { - rdp = per_cpu_ptr(rsp->rda, cpu); - if (!*rdp->nxttail[RCU_DONE_TAIL]) - continue; - rnp = rdp->mynode; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } -} - -/* - * Clean up for exit from idle. Attempt to advance callbacks based on - * any grace periods that elapsed while the CPU was idle, and if any - * callbacks are now ready to invoke, initiate invocation. - */ -static void rcu_cleanup_after_idle(int cpu) -{ - - if (rcu_is_nocb_cpu(cpu)) - return; - if (rcu_try_advance_all_cbs()) - invoke_rcu_core(); -} - -/* - * Keep a running count of the number of non-lazy callbacks posted - * on this CPU. This running counter (which is never decremented) allows - * rcu_prepare_for_idle() to detect when something out of the idle loop - * posts a callback, even if an equal number of callbacks are invoked. - * Of course, callbacks should only be posted from within a trace event - * designed to be called from idle or from within RCU_NONIDLE(). - */ -static void rcu_idle_count_callbacks_posted(void) -{ - __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); -} - -/* - * Data for flushing lazy RCU callbacks at OOM time. - */ -static atomic_t oom_callback_count; -static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); - -/* - * RCU OOM callback -- decrement the outstanding count and deliver the - * wake-up if we are the last one. - */ -static void rcu_oom_callback(struct rcu_head *rhp) -{ - if (atomic_dec_and_test(&oom_callback_count)) - wake_up(&oom_callback_wq); -} - -/* - * Post an rcu_oom_notify callback on the current CPU if it has at - * least one lazy callback. This will unnecessarily post callbacks - * to CPUs that already have a non-lazy callback at the end of their - * callback list, but this is an infrequent operation, so accept some - * extra overhead to keep things simple. - */ -static void rcu_oom_notify_cpu(void *unused) -{ - struct rcu_state *rsp; - struct rcu_data *rdp; - - for_each_rcu_flavor(rsp) { - rdp = __this_cpu_ptr(rsp->rda); - if (rdp->qlen_lazy != 0) { - atomic_inc(&oom_callback_count); - rsp->call(&rdp->oom_head, rcu_oom_callback); - } - } -} - -/* - * If low on memory, ensure that each CPU has a non-lazy callback. - * This will wake up CPUs that have only lazy callbacks, in turn - * ensuring that they free up the corresponding memory in a timely manner. - * Because an uncertain amount of memory will be freed in some uncertain - * timeframe, we do not claim to have freed anything. - */ -static int rcu_oom_notify(struct notifier_block *self, - unsigned long notused, void *nfreed) -{ - int cpu; - - /* Wait for callbacks from earlier instance to complete. */ - wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); - - /* - * Prevent premature wakeup: ensure that all increments happen - * before there is a chance of the counter reaching zero. - */ - atomic_set(&oom_callback_count, 1); - - get_online_cpus(); - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); - cond_resched(); - } - put_online_cpus(); - - /* Unconditionally decrement: no need to wake ourselves up. */ - atomic_dec(&oom_callback_count); - - return NOTIFY_OK; -} - -static struct notifier_block rcu_oom_nb = { - .notifier_call = rcu_oom_notify -}; - -static int __init rcu_register_oom_notifier(void) -{ - register_oom_notifier(&rcu_oom_nb); - return 0; -} -early_initcall(rcu_register_oom_notifier); - -#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - -#ifdef CONFIG_RCU_CPU_STALL_INFO - -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; - - sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", - rdtp->last_accelerate & 0xffff, jiffies & 0xffff, - ulong2long(nlpd), - rdtp->all_lazy ? 'L' : '.', - rdtp->tick_nohz_enabled_snap ? '.' : 'D'); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - *cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - -/* Initiate the stall-info list. */ -static void print_cpu_stall_info_begin(void) -{ - pr_cont("\n"); -} - -/* - * Print out diagnostic information for the specified stalled CPU. - * - * If the specified CPU is aware of the current RCU grace period - * (flavor specified by rsp), then print the number of scheduling - * clock interrupts the CPU has taken during the time that it has - * been aware. Otherwise, print the number of RCU grace periods - * that this CPU is ignorant of, for example, "1" if the CPU was - * aware of the previous grace period. - * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. - */ -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) -{ - char fast_no_hz[72]; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = rdp->dynticks; - char *ticks_title; - unsigned long ticks_value; - - if (rsp->gpnum == rdp->gpnum) { - ticks_title = "ticks this GP"; - ticks_value = rdp->ticks_this_gp; - } else { - ticks_title = "GPs behind"; - ticks_value = rsp->gpnum - rdp->gpnum; - } - print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", - cpu, ticks_value, ticks_title, - atomic_read(&rdtp->dynticks) & 0xfff, - rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, - rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - fast_no_hz); -} - -/* Terminate the stall-info list. */ -static void print_cpu_stall_info_end(void) -{ - pr_err("\t"); -} - -/* Zero ->ticks_this_gp for all flavors of RCU. */ -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ - rdp->ticks_this_gp = 0; - rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); -} - -/* Increment ->ticks_this_gp for all flavors of RCU. */ -static void increment_cpu_stall_ticks(void) -{ - struct rcu_state *rsp; - - for_each_rcu_flavor(rsp) - __this_cpu_ptr(rsp->rda)->ticks_this_gp++; -} - -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void print_cpu_stall_info_begin(void) -{ - pr_cont(" {"); -} - -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) -{ - pr_cont(" %d", cpu); -} - -static void print_cpu_stall_info_end(void) -{ - pr_cont("} "); -} - -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ -} - -static void increment_cpu_stall_ticks(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -#ifdef CONFIG_RCU_NOCB_CPU - -/* - * Offload callback processing from the boot-time-specified set of CPUs - * specified by rcu_nocb_mask. For each CPU in the set, there is a - * kthread created that pulls the callbacks from the corresponding CPU, - * waits for a grace period to elapse, and invokes the callbacks. - * The no-CBs CPUs do a wake_up() on their kthread when they insert - * a callback into any empty list, unless the rcu_nocb_poll boot parameter - * has been specified, in which case each kthread actively polls its - * CPU. (Which isn't so great for energy efficiency, but which does - * reduce RCU's overhead on that CPU.) - * - * This is intended to be used in conjunction with Frederic Weisbecker's - * adaptive-idle work, which would seriously reduce OS jitter on CPUs - * running CPU-bound user-mode computations. - * - * Offloading of callback processing could also in theory be used as - * an energy-efficiency measure because CPUs with no RCU callbacks - * queued are more aggressive about entering dyntick-idle mode. - */ - - -/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ -static int __init rcu_nocb_setup(char *str) -{ - alloc_bootmem_cpumask_var(&rcu_nocb_mask); - have_rcu_nocb_mask = true; - cpulist_parse(str, rcu_nocb_mask); - return 1; -} -__setup("rcu_nocbs=", rcu_nocb_setup); - -static int __init parse_rcu_nocb_poll(char *arg) -{ - rcu_nocb_poll = 1; - return 0; -} -early_param("rcu_nocb_poll", parse_rcu_nocb_poll); - -/* - * Do any no-CBs CPUs need another grace period? - * - * Interrupts must be disabled. If the caller does not hold the root - * rnp_node structure's ->lock, the results are advisory only. - */ -static int rcu_nocb_needs_gp(struct rcu_state *rsp) -{ - struct rcu_node *rnp = rcu_get_root(rsp); - - return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; -} - -/* - * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended - * grace period. - */ -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) -{ - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); -} - -/* - * Set the root rcu_node structure's ->need_future_gp field - * based on the sum of those of all rcu_node structures. This does - * double-count the root rcu_node structure's requests, but this - * is necessary to handle the possibility of a rcu_nocb_kthread() - * having awakened during the time that the rcu_node structures - * were being updated for the end of the previous grace period. - */ -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ - rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; -} - -static void rcu_init_one_nocb(struct rcu_node *rnp) -{ - init_waitqueue_head(&rnp->nocb_gp_wq[0]); - init_waitqueue_head(&rnp->nocb_gp_wq[1]); -} - -/* Is the specified CPU a no-CPUs CPU? */ -bool rcu_is_nocb_cpu(int cpu) -{ - if (have_rcu_nocb_mask) - return cpumask_test_cpu(cpu, rcu_nocb_mask); - return false; -} - -/* - * Enqueue the specified string of rcu_head structures onto the specified - * CPU's no-CBs lists. The CPU is specified by rdp, the head of the - * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy - * counts are supplied by rhcount and rhcount_lazy. - * - * If warranted, also wake up the kthread servicing this CPUs queues. - */ -static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, - struct rcu_head *rhp, - struct rcu_head **rhtp, - int rhcount, int rhcount_lazy) -{ - int len; - struct rcu_head **old_rhpp; - struct task_struct *t; - - /* Enqueue the callback on the nocb list and update counts. */ - old_rhpp = xchg(&rdp->nocb_tail, rhtp); - ACCESS_ONCE(*old_rhpp) = rhp; - atomic_long_add(rhcount, &rdp->nocb_q_count); - atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); - - /* If we are not being polled and there is a kthread, awaken it ... */ - t = ACCESS_ONCE(rdp->nocb_kthread); - if (rcu_nocb_poll || !t) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeNotPoll")); - return; - } - len = atomic_long_read(&rdp->nocb_q_count); - if (old_rhpp == &rdp->nocb_head) { - wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ - rdp->qlen_last_fqs_check = 0; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); - } else if (len > rdp->qlen_last_fqs_check + qhimark) { - wake_up_process(t); /* ... or if many callbacks queued. */ - rdp->qlen_last_fqs_check = LONG_MAX / 2; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); - } else { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); - } - return; -} - -/* - * This is a helper for __call_rcu(), which invokes this when the normal - * callback queue is inoperable. If this is not a no-CBs CPU, this - * function returns failure back to __call_rcu(), which can complain - * appropriately. - * - * Otherwise, this function queues the callback where the corresponding - * "rcuo" kthread can find it. - */ -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy) -{ - - if (!rcu_is_nocb_cpu(rdp->cpu)) - return 0; - __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); - if (__is_kfree_rcu_offset((unsigned long)rhp->func)) - trace_rcu_kfree_callback(rdp->rsp->name, rhp, - (unsigned long)rhp->func, - -atomic_long_read(&rdp->nocb_q_count_lazy), - -atomic_long_read(&rdp->nocb_q_count)); - else - trace_rcu_callback(rdp->rsp->name, rhp, - -atomic_long_read(&rdp->nocb_q_count_lazy), - -atomic_long_read(&rdp->nocb_q_count)); - return 1; -} - -/* - * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is - * not a no-CBs CPU. - */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp) -{ - long ql = rsp->qlen; - long qll = rsp->qlen_lazy; - - /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ - if (!rcu_is_nocb_cpu(smp_processor_id())) - return 0; - rsp->qlen = 0; - rsp->qlen_lazy = 0; - - /* First, enqueue the donelist, if any. This preserves CB ordering. */ - if (rsp->orphan_donelist != NULL) { - __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, - rsp->orphan_donetail, ql, qll); - ql = qll = 0; - rsp->orphan_donelist = NULL; - rsp->orphan_donetail = &rsp->orphan_donelist; - } - if (rsp->orphan_nxtlist != NULL) { - __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, - rsp->orphan_nxttail, ql, qll); - ql = qll = 0; - rsp->orphan_nxtlist = NULL; - rsp->orphan_nxttail = &rsp->orphan_nxtlist; - } - return 1; -} - -/* - * If necessary, kick off a new grace period, and either way wait - * for a subsequent grace period to complete. - */ -static void rcu_nocb_wait_gp(struct rcu_data *rdp) -{ - unsigned long c; - bool d; - unsigned long flags; - struct rcu_node *rnp = rdp->mynode; - - raw_spin_lock_irqsave(&rnp->lock, flags); - c = rcu_start_future_gp(rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - - /* - * Wait for the grace period. Do so interruptibly to avoid messing - * up the load average. - */ - trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); - for (;;) { - wait_event_interruptible( - rnp->nocb_gp_wq[c & 0x1], - (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); - if (likely(d)) - break; - flush_signals(current); - trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); - } - trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); - smp_mb(); /* Ensure that CB invocation happens after GP end. */ -} - -/* - * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes - * callbacks queued by the corresponding no-CBs CPU. - */ -static int rcu_nocb_kthread(void *arg) -{ - int c, cl; - bool firsttime = 1; - struct rcu_head *list; - struct rcu_head *next; - struct rcu_head **tail; - struct rcu_data *rdp = arg; - - /* Each pass through this loop invokes one batch of callbacks */ - for (;;) { - /* If not polling, wait for next batch of callbacks. */ - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Sleep")); - wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); - } else if (firsttime) { - firsttime = 0; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("Poll")); - } - list = ACCESS_ONCE(rdp->nocb_head); - if (!list) { - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeEmpty")); - schedule_timeout_interruptible(1); - flush_signals(current); - continue; - } - firsttime = 1; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeNonEmpty")); - - /* - * Extract queued callbacks, update counts, and wait - * for a grace period to elapse. - */ - ACCESS_ONCE(rdp->nocb_head) = NULL; - tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); - c = atomic_long_xchg(&rdp->nocb_q_count, 0); - cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); - ACCESS_ONCE(rdp->nocb_p_count) += c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; - rcu_nocb_wait_gp(rdp); - - /* Each pass through the following loop invokes a callback. */ - trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); - c = cl = 0; - while (list) { - next = list->next; - /* Wait for enqueuing to complete, if needed. */ - while (next == NULL && &list->next != tail) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WaitQueue")); - schedule_timeout_interruptible(1); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WokeQueue")); - next = list->next; - } - debug_rcu_head_unqueue(list); - local_bh_disable(); - if (__rcu_reclaim(rdp->rsp->name, list)) - cl++; - c++; - local_bh_enable(); - list = next; - } - trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); - ACCESS_ONCE(rdp->nocb_p_count) -= c; - ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; - rdp->n_nocbs_invoked += c; - } - return 0; -} - -/* Initialize per-rcu_data variables for no-CBs CPUs. */ -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -{ - rdp->nocb_tail = &rdp->nocb_head; - init_waitqueue_head(&rdp->nocb_wq); -} - -/* Create a kthread for each RCU flavor for each no-CBs CPU. */ -static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) -{ - int cpu; - struct rcu_data *rdp; - struct task_struct *t; - - if (rcu_nocb_mask == NULL) - return; - for_each_cpu(cpu, rcu_nocb_mask) { - rdp = per_cpu_ptr(rsp->rda, cpu); - t = kthread_run(rcu_nocb_kthread, rdp, - "rcuo%c/%d", rsp->abbr, cpu); - BUG_ON(IS_ERR(t)); - ACCESS_ONCE(rdp->nocb_kthread) = t; - } -} - -/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ -static bool init_nocb_callback_list(struct rcu_data *rdp) -{ - if (rcu_nocb_mask == NULL || - !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) - return false; - rdp->nxttail[RCU_NEXT_TAIL] = NULL; - return true; -} - -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ - -static int rcu_nocb_needs_gp(struct rcu_state *rsp) -{ - return 0; -} - -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) -{ -} - -static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -{ -} - -static void rcu_init_one_nocb(struct rcu_node *rnp) -{ -} - -static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, - bool lazy) -{ - return 0; -} - -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, - struct rcu_data *rdp) -{ - return 0; -} - -static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) -{ -} - -static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) -{ -} - -static bool init_nocb_callback_list(struct rcu_data *rdp) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ - -/* - * An adaptive-ticks CPU can potentially execute in kernel mode for an - * arbitrarily long period of time with the scheduling-clock tick turned - * off. RCU will be paying attention to this CPU because it is in the - * kernel, but the CPU cannot be guaranteed to be executing the RCU state - * machine because the scheduling-clock tick has been disabled. Therefore, - * if an adaptive-ticks CPU is failing to respond to the current grace - * period and has not be idle from an RCU perspective, kick it. - */ -static void rcu_kick_nohz_cpu(int cpu) -{ -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(cpu)) - smp_send_reschedule(cpu); -#endif /* #ifdef CONFIG_NO_HZ_FULL */ -} - - -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - -/* - * Define RCU flavor that holds sysidle state. This needs to be the - * most active flavor of RCU. - */ -#ifdef CONFIG_PREEMPT_RCU -static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; -#else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ - -static int full_sysidle_state; /* Current system-idle state. */ -#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ -#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ -#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ -#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ -#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ - -/* - * Invoked to note exit from irq or task transition to idle. Note that - * usermode execution does -not- count as idle here! After all, we want - * to detect full-system idle states, not RCU quiescent states and grace - * periods. The caller must have disabled interrupts. - */ -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) -{ - unsigned long j; - - /* Adjust nesting, check for fully idle. */ - if (irq) { - rdtp->dynticks_idle_nesting--; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - if (rdtp->dynticks_idle_nesting != 0) - return; /* Still not fully idle. */ - } else { - if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == - DYNTICK_TASK_NEST_VALUE) { - rdtp->dynticks_idle_nesting = 0; - } else { - rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); - return; /* Still not fully idle. */ - } - } - - /* Record start of fully idle period. */ - j = jiffies; - ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; - smp_mb__before_atomic_inc(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic_inc(); - WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); -} - -/* - * Unconditionally force exit from full system-idle state. This is - * invoked when a normal CPU exits idle, but must be called separately - * for the timekeeping CPU (tick_do_timer_cpu). The reason for this - * is that the timekeeping CPU is permitted to take scheduling-clock - * interrupts while the system is in system-idle state, and of course - * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock - * interrupt from any other type of interrupt. - */ -void rcu_sysidle_force_exit(void) -{ - int oldstate = ACCESS_ONCE(full_sysidle_state); - int newoldstate; - - /* - * Each pass through the following loop attempts to exit full - * system-idle state. If contention proves to be a problem, - * a trylock-based contention tree could be used here. - */ - while (oldstate > RCU_SYSIDLE_SHORT) { - newoldstate = cmpxchg(&full_sysidle_state, - oldstate, RCU_SYSIDLE_NOT); - if (oldstate == newoldstate && - oldstate == RCU_SYSIDLE_FULL_NOTED) { - rcu_kick_nohz_cpu(tick_do_timer_cpu); - return; /* We cleared it, done! */ - } - oldstate = newoldstate; - } - smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ -} - -/* - * Invoked to note entry to irq or task transition from idle. Note that - * usermode execution does -not- count as idle here! The caller must - * have disabled interrupts. - */ -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) -{ - /* Adjust nesting, check for already non-idle. */ - if (irq) { - rdtp->dynticks_idle_nesting++; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - if (rdtp->dynticks_idle_nesting != 1) - return; /* Already non-idle. */ - } else { - /* - * Allow for irq misnesting. Yes, it really is possible - * to enter an irq handler then never leave it, and maybe - * also vice versa. Handle both possibilities. - */ - if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { - rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; - WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); - return; /* Already non-idle. */ - } else { - rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; - } - } - - /* Record end of idle period. */ - smp_mb__before_atomic_inc(); - atomic_inc(&rdtp->dynticks_idle); - smp_mb__after_atomic_inc(); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); - - /* - * If we are the timekeeping CPU, we are permitted to be non-idle - * during a system-idle state. This must be the case, because - * the timekeeping CPU has to take scheduling-clock interrupts - * during the time that the system is transitioning to full - * system-idle state. This means that the timekeeping CPU must - * invoke rcu_sysidle_force_exit() directly if it does anything - * more than take a scheduling-clock interrupt. - */ - if (smp_processor_id() == tick_do_timer_cpu) - return; - - /* Update system-idle state: We are clearly no longer fully idle! */ - rcu_sysidle_force_exit(); -} - -/* - * Check to see if the current CPU is idle. Note that usermode execution - * does not count as idle. The caller must have disabled interrupts. - */ -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ - int cur; - unsigned long j; - struct rcu_dynticks *rdtp = rdp->dynticks; - - /* - * If some other CPU has already reported non-idle, if this is - * not the flavor of RCU that tracks sysidle state, or if this - * is an offline or the timekeeping CPU, nothing to do. - */ - if (!*isidle || rdp->rsp != rcu_sysidle_state || - cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) - return; - if (rcu_gp_in_progress(rdp->rsp)) - WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); - - /* Pick up current idle and NMI-nesting counter and check. */ - cur = atomic_read(&rdtp->dynticks_idle); - if (cur & 0x1) { - *isidle = false; /* We are not idle! */ - return; - } - smp_mb(); /* Read counters before timestamps. */ - - /* Pick up timestamps. */ - j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); - /* If this CPU entered idle more recently, update maxj timestamp. */ - if (ULONG_CMP_LT(*maxj, j)) - *maxj = j; -} - -/* - * Is this the flavor of RCU that is handling full-system idle? - */ -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return rsp == rcu_sysidle_state; -} - -/* - * Bind the grace-period kthread for the sysidle flavor of RCU to the - * timekeeping CPU. - */ -static void rcu_bind_gp_kthread(void) -{ - int cpu = ACCESS_ONCE(tick_do_timer_cpu); - - if (cpu < 0 || cpu >= nr_cpu_ids) - return; - if (raw_smp_processor_id() != cpu) - set_cpus_allowed_ptr(current, cpumask_of(cpu)); -} - -/* - * Return a delay in jiffies based on the number of CPUs, rcu_node - * leaf fanout, and jiffies tick rate. The idea is to allow larger - * systems more time to transition to full-idle state in order to - * avoid the cache thrashing that otherwise occur on the state variable. - * Really small systems (less than a couple of tens of CPUs) should - * instead use a single global atomically incremented counter, and later - * versions of this will automatically reconfigure themselves accordingly. - */ -static unsigned long rcu_sysidle_delay(void) -{ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return 0; - return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); -} - -/* - * Advance the full-system-idle state. This is invoked when all of - * the non-timekeeping CPUs are idle. - */ -static void rcu_sysidle(unsigned long j) -{ - /* Check the current state. */ - switch (ACCESS_ONCE(full_sysidle_state)) { - case RCU_SYSIDLE_NOT: - - /* First time all are idle, so note a short idle period. */ - ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; - break; - - case RCU_SYSIDLE_SHORT: - - /* - * Idle for a bit, time to advance to next state? - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); - break; - - case RCU_SYSIDLE_LONG: - - /* - * Do an additional check pass before advancing to full. - * cmpxchg failure means race with non-idle, let them win. - */ - if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) - (void)cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); - break; - - default: - break; - } -} - -/* - * Found a non-idle non-timekeeping CPU, so kick the system-idle state - * back to the beginning. - */ -static void rcu_sysidle_cancel(void) -{ - smp_mb(); - ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; -} - -/* - * Update the sysidle state based on the results of a force-quiescent-state - * scan of the CPUs' dyntick-idle state. - */ -static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, - unsigned long maxj, bool gpkt) -{ - if (rsp != rcu_sysidle_state) - return; /* Wrong flavor, ignore. */ - if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) - return; /* Running state machine from timekeeping CPU. */ - if (isidle) - rcu_sysidle(maxj); /* More idle! */ - else - rcu_sysidle_cancel(); /* Idle is over. */ -} - -/* - * Wrapper for rcu_sysidle_report() when called from the grace-period - * kthread's context. - */ -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ - rcu_sysidle_report(rsp, isidle, maxj, true); -} - -/* Callback and function for forcing an RCU grace period. */ -struct rcu_sysidle_head { - struct rcu_head rh; - int inuse; -}; - -static void rcu_sysidle_cb(struct rcu_head *rhp) -{ - struct rcu_sysidle_head *rshp; - - /* - * The following memory barrier is needed to replace the - * memory barriers that would normally be in the memory - * allocator. - */ - smp_mb(); /* grace period precedes setting inuse. */ - - rshp = container_of(rhp, struct rcu_sysidle_head, rh); - ACCESS_ONCE(rshp->inuse) = 0; -} - -/* - * Check to see if the system is fully idle, other than the timekeeping CPU. - * The caller must have disabled interrupts. - */ -bool rcu_sys_is_idle(void) -{ - static struct rcu_sysidle_head rsh; - int rss = ACCESS_ONCE(full_sysidle_state); - - if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) - return false; - - /* Handle small-system case by doing a full scan of CPUs. */ - if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { - int oldrss = rss - 1; - - /* - * One pass to advance to each state up to _FULL. - * Give up if any pass fails to advance the state. - */ - while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { - int cpu; - bool isidle = true; - unsigned long maxj = jiffies - ULONG_MAX / 4; - struct rcu_data *rdp; - - /* Scan all the CPUs looking for nonidle CPUs. */ - for_each_possible_cpu(cpu) { - rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); - rcu_sysidle_check_cpu(rdp, &isidle, &maxj); - if (!isidle) - break; - } - rcu_sysidle_report(rcu_sysidle_state, - isidle, maxj, false); - oldrss = rss; - rss = ACCESS_ONCE(full_sysidle_state); - } - } - - /* If this is the first observation of an idle period, record it. */ - if (rss == RCU_SYSIDLE_FULL) { - rss = cmpxchg(&full_sysidle_state, - RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); - return rss == RCU_SYSIDLE_FULL; - } - - smp_mb(); /* ensure rss load happens before later caller actions. */ - - /* If already fully idle, tell the caller (in case of races). */ - if (rss == RCU_SYSIDLE_FULL_NOTED) - return true; - - /* - * If we aren't there yet, and a grace period is not in flight, - * initiate a grace period. Either way, tell the caller that - * we are not there yet. We use an xchg() rather than an assignment - * to make up for the memory barriers that would otherwise be - * provided by the memory allocator. - */ - if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && - !rcu_gp_in_progress(rcu_sysidle_state) && - !rsh.inuse && xchg(&rsh.inuse, 1) == 0) - call_rcu(&rsh.rh, rcu_sysidle_cb); - return false; -} - -/* - * Initialize dynticks sysidle state for CPUs coming online. - */ -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ - rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; -} - -#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - -static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) -{ -} - -static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) -{ -} - -static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, - unsigned long *maxj) -{ -} - -static bool is_sysidle_rcu_state(struct rcu_state *rsp) -{ - return false; -} - -static void rcu_bind_gp_kthread(void) -{ -} - -static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, - unsigned long maxj) -{ -} - -static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) -{ -} - -#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c deleted file mode 100644 index cf6c17412932..000000000000 --- a/kernel/rcutree_trace.c +++ /dev/null @@ -1,500 +0,0 @@ -/* - * Read-Copy Update tracing for classic implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2008 - * - * Papers: http://www.rdrop.com/users/paulmck/RCU - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define RCU_TREE_NONCORE -#include "rcutree.h" - -static int r_open(struct inode *inode, struct file *file, - const struct seq_operations *op) -{ - int ret = seq_open(file, op); - if (!ret) { - struct seq_file *m = (struct seq_file *)file->private_data; - m->private = inode->i_private; - } - return ret; -} - -static void *r_start(struct seq_file *m, loff_t *pos) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - *pos = cpumask_next(*pos - 1, cpu_possible_mask); - if ((*pos) < nr_cpu_ids) - return per_cpu_ptr(rsp->rda, *pos); - return NULL; -} - -static void *r_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return r_start(m, pos); -} - -static void r_stop(struct seq_file *m, void *v) -{ -} - -static int show_rcubarrier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "bcc: %d nbd: %lu\n", - atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); - return 0; -} - -static int rcubarrier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcubarrier, inode->i_private); -} - -static const struct file_operations rcubarrier_fops = { - .owner = THIS_MODULE, - .open = rcubarrier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static char convert_kthread_status(unsigned int kthread_status) -{ - if (kthread_status > RCU_KTHREAD_MAX) - return '?'; - return "SRWOY"[kthread_status]; -} - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) -{ - long ql, qll; - - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' : ' ', - ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, rdp->qs_pending); - seq_printf(m, " dt=%d/%llx/%d df=%lu", - atomic_read(&rdp->dynticks->dynticks), - rdp->dynticks->dynticks_nesting, - rdp->dynticks->dynticks_nmi_nesting, - rdp->dynticks_fqs); - seq_printf(m, " of=%lu", rdp->offline_fqs); - rcu_nocb_q_lengths(rdp, &ql, &qll); - qll += rdp->qlen_lazy; - ql += rdp->qlen; - seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", - qll, ql, - ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != - rdp->nxttail[RCU_NEXT_TAIL]], - ".R"[rdp->nxttail[RCU_WAIT_TAIL] != - rdp->nxttail[RCU_NEXT_READY_TAIL]], - ".W"[rdp->nxttail[RCU_DONE_TAIL] != - rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); -#ifdef CONFIG_RCU_BOOST - seq_printf(m, " kt=%d/%c ktl=%x", - per_cpu(rcu_cpu_has_work, rdp->cpu), - convert_kthread_status(per_cpu(rcu_cpu_kthread_status, - rdp->cpu)), - per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); -#endif /* #ifdef CONFIG_RCU_BOOST */ - seq_printf(m, " b=%ld", rdp->blimit); - seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", - rdp->n_cbs_invoked, rdp->n_nocbs_invoked, - rdp->n_cbs_orphaned, rdp->n_cbs_adopted); -} - -static int show_rcudata(struct seq_file *m, void *v) -{ - print_one_rcu_data(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcudate_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcudata, -}; - -static int rcudata_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcudate_op); -} - -static const struct file_operations rcudata_fops = { - .owner = THIS_MODULE, - .open = rcudata_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcuexp(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - - seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", - atomic_long_read(&rsp->expedited_start), - atomic_long_read(&rsp->expedited_done), - atomic_long_read(&rsp->expedited_wrap), - atomic_long_read(&rsp->expedited_tryfail), - atomic_long_read(&rsp->expedited_workdone1), - atomic_long_read(&rsp->expedited_workdone2), - atomic_long_read(&rsp->expedited_normal), - atomic_long_read(&rsp->expedited_stoppedcpus), - atomic_long_read(&rsp->expedited_done_tries), - atomic_long_read(&rsp->expedited_done_lost), - atomic_long_read(&rsp->expedited_done_exit)); - return 0; -} - -static int rcuexp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuexp, inode->i_private); -} - -static const struct file_operations rcuexp_fops = { - .owner = THIS_MODULE, - .open = rcuexp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#ifdef CONFIG_RCU_BOOST - -static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) -{ - seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", - rnp->grplo, rnp->grphi, - "T."[list_empty(&rnp->blkd_tasks)], - "N."[!rnp->gp_tasks], - "E."[!rnp->exp_tasks], - "B."[!rnp->boost_tasks], - convert_kthread_status(rnp->boost_kthread_status), - rnp->n_tasks_boosted, rnp->n_exp_boosts, - rnp->n_normal_boosts); - seq_printf(m, "j=%04x bt=%04x\n", - (int)(jiffies & 0xffff), - (int)(rnp->boost_time & 0xffff)); - seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", - rnp->n_balk_blkd_tasks, - rnp->n_balk_exp_gp_tasks, - rnp->n_balk_boost_tasks, - rnp->n_balk_notblocked, - rnp->n_balk_notyet, - rnp->n_balk_nos); -} - -static int show_rcu_node_boost(struct seq_file *m, void *unused) -{ - struct rcu_node *rnp; - - rcu_for_each_leaf_node(&rcu_preempt_state, rnp) - print_one_rcu_node_boost(m, rnp); - return 0; -} - -static int rcu_node_boost_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcu_node_boost, NULL); -} - -static const struct file_operations rcu_node_boost_fops = { - .owner = THIS_MODULE, - .open = rcu_node_boost_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -#endif /* #ifdef CONFIG_RCU_BOOST */ - -static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long gpnum; - int level = 0; - struct rcu_node *rnp; - - gpnum = rsp->gpnum; - seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", - ulong2long(rsp->completed), ulong2long(gpnum), - rsp->fqs_state, - (long)(rsp->jiffies_force_qs - jiffies), - (int)(jiffies & 0xffff)); - seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", - rsp->n_force_qs, rsp->n_force_qs_ngp, - rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); - for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { - if (rnp->level != level) { - seq_puts(m, "\n"); - level = rnp->level; - } - seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", - rnp->qsmask, rnp->qsmaskinit, - ".G"[rnp->gp_tasks != NULL], - ".E"[rnp->exp_tasks != NULL], - ".T"[!list_empty(&rnp->blkd_tasks)], - rnp->grplo, rnp->grphi, rnp->grpnum); - } - seq_puts(m, "\n"); -} - -static int show_rcuhier(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - print_one_rcu_state(m, rsp); - return 0; -} - -static int rcuhier_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcuhier, inode->i_private); -} - -static const struct file_operations rcuhier_fops = { - .owner = THIS_MODULE, - .open = rcuhier_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long completed; - unsigned long gpnum; - unsigned long gpage; - unsigned long gpmax; - struct rcu_node *rnp = &rsp->node[0]; - - raw_spin_lock_irqsave(&rnp->lock, flags); - completed = ACCESS_ONCE(rsp->completed); - gpnum = ACCESS_ONCE(rsp->gpnum); - if (completed == gpnum) - gpage = 0; - else - gpage = jiffies - rsp->gp_start; - gpmax = rsp->gp_max; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", - ulong2long(completed), ulong2long(gpnum), gpage, gpmax); -} - -static int show_rcugp(struct seq_file *m, void *v) -{ - struct rcu_state *rsp = (struct rcu_state *)m->private; - show_one_rcugp(m, rsp); - return 0; -} - -static int rcugp_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcugp, inode->i_private); -} - -static const struct file_operations rcugp_fops = { - .owner = THIS_MODULE, - .open = rcugp_open, - .read = seq_read, - .llseek = no_llseek, - .release = single_release, -}; - -static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) -{ - if (!rdp->beenonline) - return; - seq_printf(m, "%3d%cnp=%ld ", - rdp->cpu, - cpu_is_offline(rdp->cpu) ? '!' : ' ', - rdp->n_rcu_pending); - seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", - rdp->n_rp_qs_pending, - rdp->n_rp_report_qs, - rdp->n_rp_cb_ready, - rdp->n_rp_cpu_needs_gp); - seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", - rdp->n_rp_gp_completed, - rdp->n_rp_gp_started, - rdp->n_rp_need_nothing); -} - -static int show_rcu_pending(struct seq_file *m, void *v) -{ - print_one_rcu_pending(m, (struct rcu_data *)v); - return 0; -} - -static const struct seq_operations rcu_pending_op = { - .start = r_start, - .next = r_next, - .stop = r_stop, - .show = show_rcu_pending, -}; - -static int rcu_pending_open(struct inode *inode, struct file *file) -{ - return r_open(inode, file, &rcu_pending_op); -} - -static const struct file_operations rcu_pending_fops = { - .owner = THIS_MODULE, - .open = rcu_pending_open, - .read = seq_read, - .llseek = no_llseek, - .release = seq_release, -}; - -static int show_rcutorture(struct seq_file *m, void *unused) -{ - seq_printf(m, "rcutorture test sequence: %lu %s\n", - rcutorture_testseq >> 1, - (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); - seq_printf(m, "rcutorture update version number: %lu\n", - rcutorture_vernum); - return 0; -} - -static int rcutorture_open(struct inode *inode, struct file *file) -{ - return single_open(file, show_rcutorture, NULL); -} - -static const struct file_operations rcutorture_fops = { - .owner = THIS_MODULE, - .open = rcutorture_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static struct dentry *rcudir; - -static int __init rcutree_trace_init(void) -{ - struct rcu_state *rsp; - struct dentry *retval; - struct dentry *rspdir; - - rcudir = debugfs_create_dir("rcu", NULL); - if (!rcudir) - goto free_out; - - for_each_rcu_flavor(rsp) { - rspdir = debugfs_create_dir(rsp->name, rcudir); - if (!rspdir) - goto free_out; - - retval = debugfs_create_file("rcudata", 0444, - rspdir, rsp, &rcudata_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuexp", 0444, - rspdir, rsp, &rcuexp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcu_pending", 0444, - rspdir, rsp, &rcu_pending_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcubarrier", 0444, - rspdir, rsp, &rcubarrier_fops); - if (!retval) - goto free_out; - -#ifdef CONFIG_RCU_BOOST - if (rsp == &rcu_preempt_state) { - retval = debugfs_create_file("rcuboost", 0444, - rspdir, NULL, &rcu_node_boost_fops); - if (!retval) - goto free_out; - } -#endif - - retval = debugfs_create_file("rcugp", 0444, - rspdir, rsp, &rcugp_fops); - if (!retval) - goto free_out; - - retval = debugfs_create_file("rcuhier", 0444, - rspdir, rsp, &rcuhier_fops); - if (!retval) - goto free_out; - } - - retval = debugfs_create_file("rcutorture", 0444, rcudir, - NULL, &rcutorture_fops); - if (!retval) - goto free_out; - return 0; -free_out: - debugfs_remove_recursive(rcudir); - return 1; -} - -static void __exit rcutree_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - - -module_init(rcutree_trace_init); -module_exit(rcutree_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); -MODULE_LICENSE("GPL"); diff --git a/kernel/srcu.c b/kernel/srcu.c deleted file mode 100644 index 01d5ccb8bfe3..000000000000 --- a/kernel/srcu.c +++ /dev/null @@ -1,651 +0,0 @@ -/* - * Sleepable Read-Copy Update mechanism for mutual exclusion. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2006 - * Copyright (C) Fujitsu, 2012 - * - * Author: Paul McKenney - * Lai Jiangshan - * - * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "rcu.h" - -/* - * Initialize an rcu_batch structure to empty. - */ -static inline void rcu_batch_init(struct rcu_batch *b) -{ - b->head = NULL; - b->tail = &b->head; -} - -/* - * Enqueue a callback onto the tail of the specified rcu_batch structure. - */ -static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) -{ - *b->tail = head; - b->tail = &head->next; -} - -/* - * Is the specified rcu_batch structure empty? - */ -static inline bool rcu_batch_empty(struct rcu_batch *b) -{ - return b->tail == &b->head; -} - -/* - * Remove the callback at the head of the specified rcu_batch structure - * and return a pointer to it, or return NULL if the structure is empty. - */ -static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) -{ - struct rcu_head *head; - - if (rcu_batch_empty(b)) - return NULL; - - head = b->head; - b->head = head->next; - if (b->tail == &head->next) - rcu_batch_init(b); - - return head; -} - -/* - * Move all callbacks from the rcu_batch structure specified by "from" to - * the structure specified by "to". - */ -static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) -{ - if (!rcu_batch_empty(from)) { - *to->tail = from->head; - to->tail = from->tail; - rcu_batch_init(from); - } -} - -static int init_srcu_struct_fields(struct srcu_struct *sp) -{ - sp->completed = 0; - spin_lock_init(&sp->queue_lock); - sp->running = false; - rcu_batch_init(&sp->batch_queue); - rcu_batch_init(&sp->batch_check0); - rcu_batch_init(&sp->batch_check1); - rcu_batch_init(&sp->batch_done); - INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); - return sp->per_cpu_ref ? 0 : -ENOMEM; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -int __init_srcu_struct(struct srcu_struct *sp, const char *name, - struct lock_class_key *key) -{ - /* Don't re-initialize a lock while it is held. */ - debug_check_no_locks_freed((void *)sp, sizeof(*sp)); - lockdep_init_map(&sp->dep_map, name, key, 0); - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(__init_srcu_struct); - -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/** - * init_srcu_struct - initialize a sleep-RCU structure - * @sp: structure to initialize. - * - * Must invoke this on a given srcu_struct before passing that srcu_struct - * to any other function. Each srcu_struct represents a separate domain - * of SRCU protection. - */ -int init_srcu_struct(struct srcu_struct *sp) -{ - return init_srcu_struct_fields(sp); -} -EXPORT_SYMBOL_GPL(init_srcu_struct); - -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ - -/* - * Returns approximate total of the readers' ->seq[] values for the - * rank of per-CPU counters specified by idx. - */ -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - unsigned long t; - - for_each_possible_cpu(cpu) { - t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); - sum += t; - } - return sum; -} - -/* - * Returns approximate number of readers active on the specified rank - * of the per-CPU ->c[] counters. - */ -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) -{ - int cpu; - unsigned long sum = 0; - unsigned long t; - - for_each_possible_cpu(cpu) { - t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); - sum += t; - } - return sum; -} - -/* - * Return true if the number of pre-existing readers is determined to - * be stably zero. An example unstable zero can occur if the call - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, - * but due to task migration, sees the corresponding __srcu_read_unlock() - * decrement. This can happen because srcu_readers_active_idx() takes - * time to sum the array, and might in fact be interrupted or preempted - * partway through the summation. - */ -static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) -{ - unsigned long seq; - - seq = srcu_readers_seq_idx(sp, idx); - - /* - * The following smp_mb() A pairs with the smp_mb() B located in - * __srcu_read_lock(). This pairing ensures that if an - * __srcu_read_lock() increments its counter after the summation - * in srcu_readers_active_idx(), then the corresponding SRCU read-side - * critical section will see any changes made prior to the start - * of the current SRCU grace period. - * - * Also, if the above call to srcu_readers_seq_idx() saw the - * increment of ->seq[], then the call to srcu_readers_active_idx() - * must see the increment of ->c[]. - */ - smp_mb(); /* A */ - - /* - * Note that srcu_readers_active_idx() can incorrectly return - * zero even though there is a pre-existing reader throughout. - * To see this, suppose that task A is in a very long SRCU - * read-side critical section that started on CPU 0, and that - * no other reader exists, so that the sum of the counters - * is equal to one. Then suppose that task B starts executing - * srcu_readers_active_idx(), summing up to CPU 1, and then that - * task C starts reading on CPU 0, so that its increment is not - * summed, but finishes reading on CPU 2, so that its decrement - * -is- summed. Then when task B completes its sum, it will - * incorrectly get zero, despite the fact that task A has been - * in its SRCU read-side critical section the whole time. - * - * We therefore do a validation step should srcu_readers_active_idx() - * return zero. - */ - if (srcu_readers_active_idx(sp, idx) != 0) - return false; - - /* - * The remainder of this function is the validation step. - * The following smp_mb() D pairs with the smp_mb() C in - * __srcu_read_unlock(). If the __srcu_read_unlock() was seen - * by srcu_readers_active_idx() above, then any destructive - * operation performed after the grace period will happen after - * the corresponding SRCU read-side critical section. - * - * Note that there can be at most NR_CPUS worth of readers using - * the old index, which is not enough to overflow even a 32-bit - * integer. (Yes, this does mean that systems having more than - * a billion or so CPUs need to be 64-bit systems.) Therefore, - * the sum of the ->seq[] counters cannot possibly overflow. - * Therefore, the only way that the return values of the two - * calls to srcu_readers_seq_idx() can be equal is if there were - * no increments of the corresponding rank of ->seq[] counts - * in the interim. But the missed-increment scenario laid out - * above includes an increment of the ->seq[] counter by - * the corresponding __srcu_read_lock(). Therefore, if this - * scenario occurs, the return values from the two calls to - * srcu_readers_seq_idx() will differ, and thus the validation - * step below suffices. - */ - smp_mb(); /* D */ - - return srcu_readers_seq_idx(sp, idx) == seq; -} - -/** - * srcu_readers_active - returns approximate number of readers. - * @sp: which srcu_struct to count active readers (holding srcu_read_lock). - * - * Note that this is not an atomic primitive, and can therefore suffer - * severe errors when invoked on an active srcu_struct. That said, it - * can be useful as an error check at cleanup time. - */ -static int srcu_readers_active(struct srcu_struct *sp) -{ - int cpu; - unsigned long sum = 0; - - for_each_possible_cpu(cpu) { - sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); - sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); - } - return sum; -} - -/** - * cleanup_srcu_struct - deconstruct a sleep-RCU structure - * @sp: structure to clean up. - * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. - */ -void cleanup_srcu_struct(struct srcu_struct *sp) -{ - if (WARN_ON(srcu_readers_active(sp))) - return; /* Leakage unless caller handles error. */ - free_percpu(sp->per_cpu_ref); - sp->per_cpu_ref = NULL; -} -EXPORT_SYMBOL_GPL(cleanup_srcu_struct); - -/* - * Counts the new reader in the appropriate per-CPU element of the - * srcu_struct. Must be called from process context. - * Returns an index that must be passed to the matching srcu_read_unlock(). - */ -int __srcu_read_lock(struct srcu_struct *sp) -{ - int idx; - - idx = ACCESS_ONCE(sp->completed) & 0x1; - preempt_disable(); - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; - smp_mb(); /* B */ /* Avoid leaking the critical section. */ - ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; - preempt_enable(); - return idx; -} -EXPORT_SYMBOL_GPL(__srcu_read_lock); - -/* - * Removes the count for the old reader from the appropriate per-CPU - * element of the srcu_struct. Note that this may well be a different - * CPU than that which was incremented by the corresponding srcu_read_lock(). - * Must be called from process context. - */ -void __srcu_read_unlock(struct srcu_struct *sp, int idx) -{ - smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_dec(sp->per_cpu_ref->c[idx]); -} -EXPORT_SYMBOL_GPL(__srcu_read_unlock); - -/* - * We use an adaptive strategy for synchronize_srcu() and especially for - * synchronize_srcu_expedited(). We spin for a fixed time period - * (defined below) to allow SRCU readers to exit their read-side critical - * sections. If there are still some readers after 10 microseconds, - * we repeatedly block for 1-millisecond time periods. This approach - * has done well in testing, so there is no need for a config parameter. - */ -#define SRCU_RETRY_CHECK_DELAY 5 -#define SYNCHRONIZE_SRCU_TRYCOUNT 2 -#define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 - -/* - * @@@ Wait until all pre-existing readers complete. Such readers - * will have used the index specified by "idx". - * the caller should ensures the ->completed is not changed while checking - * and idx = (->completed & 1) ^ 1 - */ -static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) -{ - for (;;) { - if (srcu_readers_active_idx_check(sp, idx)) - return true; - if (--trycount <= 0) - return false; - udelay(SRCU_RETRY_CHECK_DELAY); - } -} - -/* - * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->c[] and ->seq[] arrays. This allows - * us to wait for pre-existing readers in a starvation-free manner. - */ -static void srcu_flip(struct srcu_struct *sp) -{ - sp->completed++; -} - -/* - * Enqueue an SRCU callback on the specified srcu_struct structure, - * initiating grace-period processing if it is not already running. - */ -void call_srcu(struct srcu_struct *sp, struct rcu_head *head, - void (*func)(struct rcu_head *head)) -{ - unsigned long flags; - - head->next = NULL; - head->func = func; - spin_lock_irqsave(&sp->queue_lock, flags); - rcu_batch_queue(&sp->batch_queue, head); - if (!sp->running) { - sp->running = true; - schedule_delayed_work(&sp->work, 0); - } - spin_unlock_irqrestore(&sp->queue_lock, flags); -} -EXPORT_SYMBOL_GPL(call_srcu); - -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* - * Awaken the corresponding synchronize_srcu() instance now that a - * grace period has elapsed. - */ -static void wakeme_after_rcu(struct rcu_head *head) -{ - struct rcu_synchronize *rcu; - - rcu = container_of(head, struct rcu_synchronize, head); - complete(&rcu->completion); -} - -static void srcu_advance_batches(struct srcu_struct *sp, int trycount); -static void srcu_reschedule(struct srcu_struct *sp); - -/* - * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). - */ -static void __synchronize_srcu(struct srcu_struct *sp, int trycount) -{ - struct rcu_synchronize rcu; - struct rcu_head *head = &rcu.head; - bool done = false; - - rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && - !lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); - - might_sleep(); - init_completion(&rcu.completion); - - head->next = NULL; - head->func = wakeme_after_rcu; - spin_lock_irq(&sp->queue_lock); - if (!sp->running) { - /* steal the processing owner */ - sp->running = true; - rcu_batch_queue(&sp->batch_check0, head); - spin_unlock_irq(&sp->queue_lock); - - srcu_advance_batches(sp, trycount); - if (!rcu_batch_empty(&sp->batch_done)) { - BUG_ON(sp->batch_done.head != head); - rcu_batch_dequeue(&sp->batch_done); - done = true; - } - /* give the processing owner to work_struct */ - srcu_reschedule(sp); - } else { - rcu_batch_queue(&sp->batch_queue, head); - spin_unlock_irq(&sp->queue_lock); - } - - if (!done) - wait_for_completion(&rcu.completion); -} - -/** - * synchronize_srcu - wait for prior SRCU read-side critical-section completion - * @sp: srcu_struct with which to synchronize. - * - * Wait for the count to drain to zero of both indexes. To avoid the - * possible starvation of synchronize_srcu(), it waits for the count of - * the index=((->completed & 1) ^ 1) to drain to zero at first, - * and then flip the completed and wait for the count of the other index. - * - * Can block; must be called from process context. - * - * Note that it is illegal to call synchronize_srcu() from the corresponding - * SRCU read-side critical section; doing so will result in deadlock. - * However, it is perfectly legal to call synchronize_srcu() on one - * srcu_struct from some other srcu_struct's read-side critical section. - */ -void synchronize_srcu(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, rcu_expedited - ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT - : SYNCHRONIZE_SRCU_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu); - -/** - * synchronize_srcu_expedited - Brute-force SRCU grace period - * @sp: srcu_struct with which to synchronize. - * - * Wait for an SRCU grace period to elapse, but be more aggressive about - * spinning rather than blocking when waiting. - * - * Note that it is also illegal to call synchronize_srcu_expedited() - * from the corresponding SRCU read-side critical section; - * doing so will result in deadlock. However, it is perfectly legal - * to call synchronize_srcu_expedited() on one srcu_struct from some - * other srcu_struct's read-side critical section, as long as - * the resulting graph of srcu_structs is acyclic. - */ -void synchronize_srcu_expedited(struct srcu_struct *sp) -{ - __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); -} -EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); - -/** - * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. - */ -void srcu_barrier(struct srcu_struct *sp) -{ - synchronize_srcu(sp); -} -EXPORT_SYMBOL_GPL(srcu_barrier); - -/** - * srcu_batches_completed - return batches completed. - * @sp: srcu_struct on which to report batch completion. - * - * Report the number of batches, correlated with, but not necessarily - * precisely the same as, the number of grace periods that have elapsed. - */ -long srcu_batches_completed(struct srcu_struct *sp) -{ - return sp->completed; -} -EXPORT_SYMBOL_GPL(srcu_batches_completed); - -#define SRCU_CALLBACK_BATCH 10 -#define SRCU_INTERVAL 1 - -/* - * Move any new SRCU callbacks to the first stage of the SRCU grace - * period pipeline. - */ -static void srcu_collect_new(struct srcu_struct *sp) -{ - if (!rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - rcu_batch_move(&sp->batch_check0, &sp->batch_queue); - spin_unlock_irq(&sp->queue_lock); - } -} - -/* - * Core SRCU state machine. Advance callbacks from ->batch_check0 to - * ->batch_check1 and then to ->batch_done as readers drain. - */ -static void srcu_advance_batches(struct srcu_struct *sp, int trycount) -{ - int idx = 1 ^ (sp->completed & 1); - - /* - * Because readers might be delayed for an extended period after - * fetching ->completed for their index, at any point in time there - * might well be readers using both idx=0 and idx=1. We therefore - * need to wait for readers to clear from both index values before - * invoking a callback. - */ - - if (rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_check1)) - return; /* no callbacks need to be advanced */ - - if (!try_check_zero(sp, idx, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have already done with their - * first zero check and flip back when they were enqueued on - * ->batch_check0 in a previous invocation of srcu_advance_batches(). - * (Presumably try_check_zero() returned false during that - * invocation, leaving the callbacks stranded on ->batch_check1.) - * They are therefore ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); - - if (rcu_batch_empty(&sp->batch_check0)) - return; /* no callbacks need to be advanced */ - srcu_flip(sp); - - /* - * The callbacks in ->batch_check0 just finished their - * first check zero and flip, so move them to ->batch_check1 - * for future checking on the other idx. - */ - rcu_batch_move(&sp->batch_check1, &sp->batch_check0); - - /* - * SRCU read-side critical sections are normally short, so check - * at least twice in quick succession after a flip. - */ - trycount = trycount < 2 ? 2 : trycount; - if (!try_check_zero(sp, idx^1, trycount)) - return; /* failed to advance, will try after SRCU_INTERVAL */ - - /* - * The callbacks in ->batch_check1 have now waited for all - * pre-existing readers using both idx values. They are therefore - * ready to invoke, so move them to ->batch_done. - */ - rcu_batch_move(&sp->batch_done, &sp->batch_check1); -} - -/* - * Invoke a limited number of SRCU callbacks that have passed through - * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. - */ -static void srcu_invoke_callbacks(struct srcu_struct *sp) -{ - int i; - struct rcu_head *head; - - for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { - head = rcu_batch_dequeue(&sp->batch_done); - if (!head) - break; - local_bh_disable(); - head->func(head); - local_bh_enable(); - } -} - -/* - * Finished one round of SRCU grace period. Start another if there are - * more SRCU callbacks queued, otherwise put SRCU into not-running state. - */ -static void srcu_reschedule(struct srcu_struct *sp) -{ - bool pending = true; - - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - spin_lock_irq(&sp->queue_lock); - if (rcu_batch_empty(&sp->batch_done) && - rcu_batch_empty(&sp->batch_check1) && - rcu_batch_empty(&sp->batch_check0) && - rcu_batch_empty(&sp->batch_queue)) { - sp->running = false; - pending = false; - } - spin_unlock_irq(&sp->queue_lock); - } - - if (pending) - schedule_delayed_work(&sp->work, SRCU_INTERVAL); -} - -/* - * This is the work-queue function that handles SRCU grace periods. - */ -void process_srcu(struct work_struct *work) -{ - struct srcu_struct *sp; - - sp = container_of(work, struct srcu_struct, work.work); - - srcu_collect_new(sp); - srcu_advance_batches(sp, 1); - srcu_invoke_callbacks(sp); - srcu_reschedule(sp); -} -EXPORT_SYMBOL_GPL(process_srcu); -- cgit v1.3 From 7a6354e241d8fbc145836ac24e47630f12754536 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 18:07:08 +0100 Subject: sched: Move wait.c into kernel/sched/ Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-5q5yqvdaen0rmapwloeaotx3@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 +- kernel/sched/Makefile | 1 + kernel/sched/wait.c | 401 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/wait.c | 401 -------------------------------------------------- 4 files changed, 403 insertions(+), 402 deletions(-) create mode 100644 kernel/sched/wait.c delete mode 100644 kernel/wait.c (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 1ce47553fb02..b3d51e229356 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ rcupdate.o extable.o params.o posix-timers.o \ - kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 54adcf35f495..f8d3f4baa1a1 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-y += wait.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c new file mode 100644 index 000000000000..de21c6305a44 --- /dev/null +++ b/kernel/sched/wait.c @@ -0,0 +1,401 @@ +/* + * Generic waiting primitives. + * + * (C) 2004 Nadia Yvette Chambers, Oracle + */ +#include +#include +#include +#include +#include +#include + +void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) +{ + spin_lock_init(&q->lock); + lockdep_set_class_and_name(&q->lock, key, name); + INIT_LIST_HEAD(&q->task_list); +} + +EXPORT_SYMBOL(__init_waitqueue_head); + +void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue); + +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue_tail(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive); + +void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __remove_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(remove_wait_queue); + + +/* + * Note: we use "set_current_state()" _after_ the wait-queue add, + * because we need a memory barrier there on SMP, so that any + * wake-function that tests for the wait-queue being active + * will be guaranteed to see waitqueue addition _or_ subsequent + * tests in this thread will see the wakeup having taken place. + * + * The spin_unlock() itself is semi-permeable and only protects + * one way (it only protects stuff inside the critical region and + * stops them from bleeding out - it would still allow subsequent + * loads to move into the critical region). + */ +void +prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait); + +void +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive); + +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + wait->private = current; + wait->func = autoremove_wake_function; + + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) { + if (wait->flags & WQ_FLAG_EXCLUSIVE) + __add_wait_queue_tail(q, wait); + else + __add_wait_queue(q, wait); + } + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); + + return 0; +} +EXPORT_SYMBOL(prepare_to_wait_event); + +/** + * finish_wait - clean up after waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + */ +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + /* + * We can check for list emptiness outside the lock + * IFF: + * - we use the "careful" check that verifies both + * the next and prev pointers, so that there cannot + * be any half-pending updates in progress on other + * CPU's that we haven't seen yet (and that might + * still change the stack area. + * and + * - all other users take the lock (ie we can only + * have _one_ other CPU that looks at or modifies + * the list). + */ + if (!list_empty_careful(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_wait); + +/** + * abort_exclusive_wait - abort exclusive waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * @mode: runstate of the waiter to be woken + * @key: key to identify a wait bit queue or %NULL + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + * + * Wakes up the next waiter if the caller is concurrently + * woken up through the queue. + * + * This prevents waiter starvation where an exclusive waiter + * aborts and is woken up concurrently and no one wakes up + * the next waiter. + */ +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, + unsigned int mode, void *key) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); + else if (waitqueue_active(q)) + __wake_up_locked_key(q, mode, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(abort_exclusive_wait); + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + + if (ret) + list_del_init(&wait->task_list); + return ret; +} +EXPORT_SYMBOL(autoremove_wake_function); + +int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + test_bit(key->bit_nr, key->flags)) + return 0; + else + return autoremove_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. + */ +int __sched +__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + if (test_bit(q->key.bit_nr, q->key.flags)) + ret = (*action)(q->key.flags); + } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); + finish_wait(wq, &q->wait); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched out_of_line_wait_on_bit(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched +__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + do { + int ret; + + prepare_to_wait_exclusive(wq, &q->wait, mode); + if (!test_bit(q->key.bit_nr, q->key.flags)) + continue; + ret = action(q->key.flags); + if (!ret) + continue; + abort_exclusive_wait(wq, &q->wait, mode, &q->key); + return ret; + } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); + finish_wait(wq, &q->wait); + return 0; +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit_lock(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +{ + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &key); +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_clear_bit(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. + */ +void wake_up_bit(void *word, int bit) +{ + __wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + const struct zone *zone = page_zone(virt_to_page(word)); + unsigned long val = (unsigned long)word << shift | bit; + + return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; +} +EXPORT_SYMBOL(bit_waitqueue); + +/* + * Manipulate the atomic_t address to produce a better bit waitqueue table hash + * index (we're keying off bit -1, but that would produce a horrible hash + * value). + */ +static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) +{ + if (BITS_PER_LONG == 64) { + unsigned long q = (unsigned long)p; + return bit_waitqueue((void *)(q & ~1), q & 1); + } + return bit_waitqueue(p, 0); +} + +static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, + void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + atomic_t *val = key->flags; + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + atomic_read(val) != 0) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +/* + * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, + * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero + * return codes halt waiting and return. + */ +static __sched +int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(atomic_t *), unsigned mode) +{ + atomic_t *val; + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + val = q->key.flags; + if (atomic_read(val) == 0) + break; + ret = (*action)(val); + } while (!ret && atomic_read(val) != 0); + finish_wait(wq, &q->wait); + return ret; +} + +#define DEFINE_WAIT_ATOMIC_T(name, p) \ + struct wait_bit_queue name = { \ + .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ + .wait = { \ + .private = current, \ + .func = wake_atomic_t_function, \ + .task_list = \ + LIST_HEAD_INIT((name).wait.task_list), \ + }, \ + } + +__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), + unsigned mode) +{ + wait_queue_head_t *wq = atomic_t_waitqueue(p); + DEFINE_WAIT_ATOMIC_T(wait, p); + + return __wait_on_atomic_t(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); + +/** + * wake_up_atomic_t - Wake up a waiter on a atomic_t + * @p: The atomic_t being waited on, a kernel virtual address + * + * Wake up anyone waiting for the atomic_t to go to zero. + * + * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t + * check is done by the waiter's wake function, not the by the waker itself). + */ +void wake_up_atomic_t(atomic_t *p) +{ + __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); +} +EXPORT_SYMBOL(wake_up_atomic_t); diff --git a/kernel/wait.c b/kernel/wait.c deleted file mode 100644 index de21c6305a44..000000000000 --- a/kernel/wait.c +++ /dev/null @@ -1,401 +0,0 @@ -/* - * Generic waiting primitives. - * - * (C) 2004 Nadia Yvette Chambers, Oracle - */ -#include -#include -#include -#include -#include -#include - -void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) -{ - spin_lock_init(&q->lock); - lockdep_set_class_and_name(&q->lock, key, name); - INIT_LIST_HEAD(&q->task_list); -} - -EXPORT_SYMBOL(__init_waitqueue_head); - -void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(add_wait_queue); - -void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue_tail(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(add_wait_queue_exclusive); - -void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __remove_wait_queue(q, wait); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(remove_wait_queue); - - -/* - * Note: we use "set_current_state()" _after_ the wait-queue add, - * because we need a memory barrier there on SMP, so that any - * wake-function that tests for the wait-queue being active - * will be guaranteed to see waitqueue addition _or_ subsequent - * tests in this thread will see the wakeup having taken place. - * - * The spin_unlock() itself is semi-permeable and only protects - * one way (it only protects stuff inside the critical region and - * stops them from bleeding out - it would still allow subsequent - * loads to move into the critical region). - */ -void -prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags &= ~WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue(q, wait); - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(prepare_to_wait); - -void -prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - wait->flags |= WQ_FLAG_EXCLUSIVE; - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) - __add_wait_queue_tail(q, wait); - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(prepare_to_wait_exclusive); - -long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) -{ - unsigned long flags; - - if (signal_pending_state(state, current)) - return -ERESTARTSYS; - - wait->private = current; - wait->func = autoremove_wake_function; - - spin_lock_irqsave(&q->lock, flags); - if (list_empty(&wait->task_list)) { - if (wait->flags & WQ_FLAG_EXCLUSIVE) - __add_wait_queue_tail(q, wait); - else - __add_wait_queue(q, wait); - } - set_current_state(state); - spin_unlock_irqrestore(&q->lock, flags); - - return 0; -} -EXPORT_SYMBOL(prepare_to_wait_event); - -/** - * finish_wait - clean up after waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - */ -void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - /* - * We can check for list emptiness outside the lock - * IFF: - * - we use the "careful" check that verifies both - * the next and prev pointers, so that there cannot - * be any half-pending updates in progress on other - * CPU's that we haven't seen yet (and that might - * still change the stack area. - * and - * - all other users take the lock (ie we can only - * have _one_ other CPU that looks at or modifies - * the list). - */ - if (!list_empty_careful(&wait->task_list)) { - spin_lock_irqsave(&q->lock, flags); - list_del_init(&wait->task_list); - spin_unlock_irqrestore(&q->lock, flags); - } -} -EXPORT_SYMBOL(finish_wait); - -/** - * abort_exclusive_wait - abort exclusive waiting in a queue - * @q: waitqueue waited on - * @wait: wait descriptor - * @mode: runstate of the waiter to be woken - * @key: key to identify a wait bit queue or %NULL - * - * Sets current thread back to running state and removes - * the wait descriptor from the given waitqueue if still - * queued. - * - * Wakes up the next waiter if the caller is concurrently - * woken up through the queue. - * - * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and no one wakes up - * the next waiter. - */ -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, - unsigned int mode, void *key) -{ - unsigned long flags; - - __set_current_state(TASK_RUNNING); - spin_lock_irqsave(&q->lock, flags); - if (!list_empty(&wait->task_list)) - list_del_init(&wait->task_list); - else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(abort_exclusive_wait); - -int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) -{ - int ret = default_wake_function(wait, mode, sync, key); - - if (ret) - list_del_init(&wait->task_list); - return ret; -} -EXPORT_SYMBOL(autoremove_wake_function); - -int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - test_bit(key->bit_nr, key->flags)) - return 0; - else - return autoremove_wake_function(wait, mode, sync, key); -} -EXPORT_SYMBOL(wake_bit_function); - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) - * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are - * permitted return codes. Nonzero return codes halt waiting and return. - */ -int __sched -__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) -{ - int ret = 0; - - do { - prepare_to_wait(wq, &q->wait, mode); - if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(q->key.flags); - } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); - finish_wait(wq, &q->wait); - return ret; -} -EXPORT_SYMBOL(__wait_on_bit); - -int __sched out_of_line_wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit); - -int __sched -__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(void *), unsigned mode) -{ - do { - int ret; - - prepare_to_wait_exclusive(wq, &q->wait, mode); - if (!test_bit(q->key.bit_nr, q->key.flags)) - continue; - ret = action(q->key.flags); - if (!ret) - continue; - abort_exclusive_wait(wq, &q->wait, mode, &q->key); - return ret; - } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); - finish_wait(wq, &q->wait); - return 0; -} -EXPORT_SYMBOL(__wait_on_bit_lock); - -int __sched out_of_line_wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) -{ - wait_queue_head_t *wq = bit_waitqueue(word, bit); - DEFINE_WAIT_BIT(wait, word, bit); - - return __wait_on_bit_lock(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); - -void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) -{ - struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, 1, &key); -} -EXPORT_SYMBOL(__wake_up_bit); - -/** - * wake_up_bit - wake up a waiter on a bit - * @word: the word being waited on, a kernel virtual address - * @bit: the bit of the word being waited on - * - * There is a standard hashed waitqueue table for generic use. This - * is the part of the hashtable's accessor API that wakes up waiters - * on a bit. For instance, if one were to have waiters on a bitflag, - * one would call wake_up_bit() after clearing the bit. - * - * In order for this to function properly, as it uses waitqueue_active() - * internally, some kind of memory barrier must be done prior to calling - * this. Typically, this will be smp_mb__after_clear_bit(), but in some - * cases where bitflags are manipulated non-atomically under a lock, one - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), - * because spin_unlock() does not guarantee a memory barrier. - */ -void wake_up_bit(void *word, int bit) -{ - __wake_up_bit(bit_waitqueue(word, bit), word, bit); -} -EXPORT_SYMBOL(wake_up_bit); - -wait_queue_head_t *bit_waitqueue(void *word, int bit) -{ - const int shift = BITS_PER_LONG == 32 ? 5 : 6; - const struct zone *zone = page_zone(virt_to_page(word)); - unsigned long val = (unsigned long)word << shift | bit; - - return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; -} -EXPORT_SYMBOL(bit_waitqueue); - -/* - * Manipulate the atomic_t address to produce a better bit waitqueue table hash - * index (we're keying off bit -1, but that would produce a horrible hash - * value). - */ -static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) -{ - if (BITS_PER_LONG == 64) { - unsigned long q = (unsigned long)p; - return bit_waitqueue((void *)(q & ~1), q & 1); - } - return bit_waitqueue(p, 0); -} - -static int wake_atomic_t_function(wait_queue_t *wait, unsigned mode, int sync, - void *arg) -{ - struct wait_bit_key *key = arg; - struct wait_bit_queue *wait_bit - = container_of(wait, struct wait_bit_queue, wait); - atomic_t *val = key->flags; - - if (wait_bit->key.flags != key->flags || - wait_bit->key.bit_nr != key->bit_nr || - atomic_read(val) != 0) - return 0; - return autoremove_wake_function(wait, mode, sync, key); -} - -/* - * To allow interruptible waiting and asynchronous (i.e. nonblocking) waiting, - * the actions of __wait_on_atomic_t() are permitted return codes. Nonzero - * return codes halt waiting and return. - */ -static __sched -int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q, - int (*action)(atomic_t *), unsigned mode) -{ - atomic_t *val; - int ret = 0; - - do { - prepare_to_wait(wq, &q->wait, mode); - val = q->key.flags; - if (atomic_read(val) == 0) - break; - ret = (*action)(val); - } while (!ret && atomic_read(val) != 0); - finish_wait(wq, &q->wait); - return ret; -} - -#define DEFINE_WAIT_ATOMIC_T(name, p) \ - struct wait_bit_queue name = { \ - .key = __WAIT_ATOMIC_T_KEY_INITIALIZER(p), \ - .wait = { \ - .private = current, \ - .func = wake_atomic_t_function, \ - .task_list = \ - LIST_HEAD_INIT((name).wait.task_list), \ - }, \ - } - -__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *), - unsigned mode) -{ - wait_queue_head_t *wq = atomic_t_waitqueue(p); - DEFINE_WAIT_ATOMIC_T(wait, p); - - return __wait_on_atomic_t(wq, &wait, action, mode); -} -EXPORT_SYMBOL(out_of_line_wait_on_atomic_t); - -/** - * wake_up_atomic_t - Wake up a waiter on a atomic_t - * @p: The atomic_t being waited on, a kernel virtual address - * - * Wake up anyone waiting for the atomic_t to go to zero. - * - * Abuse the bit-waker function and its waitqueue hash table set (the atomic_t - * check is done by the waiter's wake function, not the by the waker itself). - */ -void wake_up_atomic_t(atomic_t *p) -{ - __wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR); -} -EXPORT_SYMBOL(wake_up_atomic_t); -- cgit v1.3 From 01768b42dc97a67b4fb33a2535c49fc1969880df Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 18:11:53 +0100 Subject: locking: Move the mutex code to kernel/locking/ Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-1ditvncg30dgbpvrz2bxfmke@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Makefile | 6 +- kernel/locking/Makefile | 9 + kernel/locking/mutex-debug.c | 110 +++++ kernel/locking/mutex-debug.h | 55 +++ kernel/locking/mutex.c | 960 +++++++++++++++++++++++++++++++++++++++++++ kernel/locking/mutex.h | 48 +++ kernel/mutex-debug.c | 110 ----- kernel/mutex-debug.h | 55 --- kernel/mutex.c | 960 ------------------------------------------- kernel/mutex.h | 48 --- 10 files changed, 1184 insertions(+), 1177 deletions(-) create mode 100644 kernel/locking/Makefile create mode 100644 kernel/locking/mutex-debug.c create mode 100644 kernel/locking/mutex-debug.h create mode 100644 kernel/locking/mutex.c create mode 100644 kernel/locking/mutex.h delete mode 100644 kernel/mutex-debug.c delete mode 100644 kernel/mutex-debug.h delete mode 100644 kernel/mutex.c delete mode 100644 kernel/mutex.h (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index a4d1aa8da9bc..330b14666475 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ extable.o params.o posix-timers.o \ - kthread.o sys_ni.o posix-cpu-timers.o mutex.o \ + kthread.o sys_ni.o posix-cpu-timers.o \ hrtimer.o rwsem.o nsproxy.o semaphore.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o @@ -16,13 +16,12 @@ ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg CFLAGS_REMOVE_lockdep_proc.o = -pg -CFLAGS_REMOVE_mutex-debug.o = -pg -CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif obj-y += sched/ +obj-y += locking/ obj-y += power/ obj-y += printk/ obj-y += cpu/ @@ -34,7 +33,6 @@ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ -obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile new file mode 100644 index 000000000000..fe8bd58b22f8 --- /dev/null +++ b/kernel/locking/Makefile @@ -0,0 +1,9 @@ + +obj-y += mutex.o + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg +endif + +obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c new file mode 100644 index 000000000000..7e3443fe1f48 --- /dev/null +++ b/kernel/locking/mutex-debug.c @@ -0,0 +1,110 @@ +/* + * kernel/mutex-debug.c + * + * Debugging code for mutexes + * + * Started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * lock debugging, locking tree, deadlock detection started by: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mutex-debug.h" + +/* + * Must be called with lock->wait_lock held. + */ +void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) +{ + memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); + waiter->magic = waiter; + INIT_LIST_HEAD(&waiter->list); +} + +void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) +{ + SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); + DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); + DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); +} + +void debug_mutex_free_waiter(struct mutex_waiter *waiter) +{ + DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list)); + memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); +} + +void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct thread_info *ti) +{ + SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + + /* Mark the current thread as blocked on the lock: */ + ti->task->blocked_on = waiter; +} + +void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct thread_info *ti) +{ + DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); + DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); + DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); + ti->task->blocked_on = NULL; + + list_del_init(&waiter->list); + waiter->task = NULL; +} + +void debug_mutex_unlock(struct mutex *lock) +{ + if (unlikely(!debug_locks)) + return; + + DEBUG_LOCKS_WARN_ON(lock->magic != lock); + DEBUG_LOCKS_WARN_ON(lock->owner != current); + DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); + mutex_clear_owner(lock); +} + +void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + lock->magic = lock; +} + +/*** + * mutex_destroy - mark a mutex unusable + * @lock: the mutex to be destroyed + * + * This function marks the mutex uninitialized, and any subsequent + * use of the mutex is forbidden. The mutex must not be locked when + * this function is called. + */ +void mutex_destroy(struct mutex *lock) +{ + DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); + lock->magic = NULL; +} + +EXPORT_SYMBOL_GPL(mutex_destroy); diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h new file mode 100644 index 000000000000..0799fd3e4cfa --- /dev/null +++ b/kernel/locking/mutex-debug.h @@ -0,0 +1,55 @@ +/* + * Mutexes: blocking mutual exclusion locks + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * This file contains mutex debugging related internal declarations, + * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case. + * More details are in kernel/mutex-debug.c. + */ + +/* + * This must be called with lock->wait_lock held. + */ +extern void debug_mutex_lock_common(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_wake_waiter(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); +extern void debug_mutex_add_waiter(struct mutex *lock, + struct mutex_waiter *waiter, + struct thread_info *ti); +extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct thread_info *ti); +extern void debug_mutex_unlock(struct mutex *lock); +extern void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key); + +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current; +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} + +#define spin_lock_mutex(lock, flags) \ + do { \ + struct mutex *l = container_of(lock, struct mutex, wait_lock); \ + \ + DEBUG_LOCKS_WARN_ON(in_interrupt()); \ + local_irq_save(flags); \ + arch_spin_lock(&(lock)->rlock.raw_lock);\ + DEBUG_LOCKS_WARN_ON(l->magic != l); \ + } while (0) + +#define spin_unlock_mutex(lock, flags) \ + do { \ + arch_spin_unlock(&(lock)->rlock.raw_lock); \ + local_irq_restore(flags); \ + preempt_check_resched(); \ + } while (0) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c new file mode 100644 index 000000000000..d24105b1b794 --- /dev/null +++ b/kernel/locking/mutex.c @@ -0,0 +1,960 @@ +/* + * kernel/mutex.c + * + * Mutexes: blocking mutual exclusion locks + * + * Started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and + * David Howells for suggestions and improvements. + * + * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline + * from the -rt tree, where it was originally implemented for rtmutexes + * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + * and Sven Dietrich. + * + * Also see Documentation/mutex-design.txt. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * In the DEBUG case we are using the "NULL fastpath" for mutexes, + * which forces all calls into the slowpath: + */ +#ifdef CONFIG_DEBUG_MUTEXES +# include "mutex-debug.h" +# include +#else +# include "mutex.h" +# include +#endif + +/* + * A negative mutex count indicates that waiters are sleeping waiting for the + * mutex. + */ +#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) + +void +__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) +{ + atomic_set(&lock->count, 1); + spin_lock_init(&lock->wait_lock); + INIT_LIST_HEAD(&lock->wait_list); + mutex_clear_owner(lock); +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + lock->spin_mlock = NULL; +#endif + + debug_mutex_init(lock, name, key); +} + +EXPORT_SYMBOL(__mutex_init); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +/* + * We split the mutex lock/unlock logic into separate fastpath and + * slowpath functions, to reduce the register pressure on the fastpath. + * We also put the fastpath first in the kernel image, to make sure the + * branch is predicted by the CPU as default-untaken. + */ +static __used noinline void __sched +__mutex_lock_slowpath(atomic_t *lock_count); + +/** + * mutex_lock - acquire the mutex + * @lock: the mutex to be acquired + * + * Lock the mutex exclusively for this task. If the mutex is not + * available right now, it will sleep until it can get it. + * + * The mutex must later on be released by the same task that + * acquired it. Recursive locking is not allowed. The task + * may not exit without first unlocking the mutex. Also, kernel + * memory where the mutex resides mutex must not be freed with + * the mutex still locked. The mutex must first be initialized + * (or statically defined) before it can be locked. memset()-ing + * the mutex to 0 is not allowed. + * + * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging + * checks that will enforce the restrictions and will also do + * deadlock debugging. ) + * + * This function is similar to (but not equivalent to) down(). + */ +void __sched mutex_lock(struct mutex *lock) +{ + might_sleep(); + /* + * The locking fastpath is the 1->0 transition from + * 'unlocked' into 'locked' state. + */ + __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + mutex_set_owner(lock); +} + +EXPORT_SYMBOL(mutex_lock); +#endif + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * In order to avoid a stampede of mutex spinners from acquiring the mutex + * more or less simultaneously, the spinners need to acquire a MCS lock + * first before spinning on the owner field. + * + * We don't inline mspin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +struct mspin_node { + struct mspin_node *next ; + int locked; /* 1 if lock acquired */ +}; +#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) + +static noinline +void mspin_lock(struct mspin_node **lock, struct mspin_node *node) +{ + struct mspin_node *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* Lock acquired */ + node->locked = 1; + return; + } + ACCESS_ONCE(prev->next) = node; + smp_wmb(); + /* Wait until the lock holder passes the lock down */ + while (!ACCESS_ONCE(node->locked)) + arch_mutex_cpu_relax(); +} + +static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) +{ + struct mspin_node *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (cmpxchg(lock, node, NULL) == node) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + ACCESS_ONCE(next->locked) = 1; + smp_wmb(); +} + +/* + * Mutex spinning code migrated from kernel/sched/core.c + */ + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + if (lock->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +static noinline +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(lock, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. + */ + return lock->owner == NULL; +} + +/* + * Initial check for entering the mutex spinning loop + */ +static inline int mutex_can_spin_on_owner(struct mutex *lock) +{ + struct task_struct *owner; + int retval = 1; + + rcu_read_lock(); + owner = ACCESS_ONCE(lock->owner); + if (owner) + retval = owner->on_cpu; + rcu_read_unlock(); + /* + * if lock->owner is not set, the mutex owner may have just acquired + * it and not set the owner yet or the mutex has been released. + */ + return retval; +} +#endif + +static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); + +/** + * mutex_unlock - release the mutex + * @lock: the mutex to be released + * + * Unlock a mutex that has been locked by this task previously. + * + * This function must not be used in interrupt context. Unlocking + * of a not locked mutex is not allowed. + * + * This function is similar to (but not equivalent to) up(). + */ +void __sched mutex_unlock(struct mutex *lock) +{ + /* + * The unlocking fastpath is the 0->1 transition from 'locked' + * into 'unlocked' state: + */ +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(lock); +#endif + __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); +} + +EXPORT_SYMBOL(mutex_unlock); + +/** + * ww_mutex_unlock - release the w/w mutex + * @lock: the mutex to be released + * + * Unlock a mutex that has been locked by this task previously with any of the + * ww_mutex_lock* functions (with or without an acquire context). It is + * forbidden to release the locks after releasing the acquire context. + * + * This function must not be used in interrupt context. Unlocking + * of a unlocked mutex is not allowed. + */ +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ + /* + * The unlocking fastpath is the 0->1 transition from 'locked' + * into 'unlocked' state: + */ + if (lock->ctx) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif + if (lock->ctx->acquired > 0) + lock->ctx->acquired--; + lock->ctx = NULL; + } + +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(&lock->base); +#endif + __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); +} +EXPORT_SYMBOL(ww_mutex_unlock); + +static inline int __sched +__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); + + if (!hold_ctx) + return 0; + + if (unlikely(ctx == hold_ctx)) + return -EALREADY; + + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(ctx->contending_lock); + ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} + +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, + struct ww_acquire_ctx *ww_ctx) +{ +#ifdef CONFIG_DEBUG_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } + + /* + * Naughty, using a different class will lead to undefined behavior! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; +} + +/* + * after acquiring lock with fastpath or when we lost out in contested + * slowpath, set ctx and wake up any waiters so they can recheck. + * + * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, + * as the fastpath and opportunistic spinning are disabled in that case. + */ +static __always_inline void +ww_mutex_set_context_fastpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + unsigned long flags; + struct mutex_waiter *cur; + + ww_mutex_lock_acquired(lock, ctx); + + lock->ctx = ctx; + + /* + * The lock->ctx update should be visible on all cores before + * the atomic read is done, otherwise contended waiters might be + * missed. The contended waiters will either see ww_ctx == NULL + * and keep spinning, or it will acquire wait_lock, add itself + * to waiter list and sleep. + */ + smp_mb(); /* ^^^ */ + + /* + * Check if lock is contended, if not there is nobody to wake up + */ + if (likely(atomic_read(&lock->base.count) == 0)) + return; + + /* + * Uh oh, we raced in fastpath, wake up everyone in this case, + * so they can see the new lock->ctx. + */ + spin_lock_mutex(&lock->base.wait_lock, flags); + list_for_each_entry(cur, &lock->base.wait_list, list) { + debug_mutex_wake_waiter(&lock->base, cur); + wake_up_process(cur->task); + } + spin_unlock_mutex(&lock->base.wait_lock, flags); +} + +/* + * Lock a mutex (possibly interruptible), slowpath: + */ +static __always_inline int __sched +__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, + struct lockdep_map *nest_lock, unsigned long ip, + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) +{ + struct task_struct *task = current; + struct mutex_waiter waiter; + unsigned long flags; + int ret; + + preempt_disable(); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + /* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that there are no + * pending waiters and the lock owner is currently running on a + * (different) CPU. + * + * The rationale is that if the lock owner is running, it is likely to + * release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn't track the owner atomically in the lock field, we need to + * track it non-atomically. + * + * We can't do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + * + * The mutex spinners are queued up using MCS lock so that only one + * spinner can compete for the mutex. However, if mutex spinning isn't + * going to happen, there is no point in going through the lock/unlock + * overhead. + */ + if (!mutex_can_spin_on_owner(lock)) + goto slowpath; + + for (;;) { + struct task_struct *owner; + struct mspin_node node; + + if (use_ww_ctx && ww_ctx->acquired > 0) { + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base); + /* + * If ww->ctx is set the contents are undefined, only + * by acquiring wait_lock there is a guarantee that + * they are not invalid when reading. + * + * As such, when deadlock detection needs to be + * performed the optimistic spinning cannot be done. + */ + if (ACCESS_ONCE(ww->ctx)) + goto slowpath; + } + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + mspin_lock(MLOCK(lock), &node); + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) { + mspin_unlock(MLOCK(lock), &node); + goto slowpath; + } + + if ((atomic_read(&lock->count) == 1) && + (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { + lock_acquired(&lock->dep_map, ip); + if (use_ww_ctx) { + struct ww_mutex *ww; + ww = container_of(lock, struct ww_mutex, base); + + ww_mutex_set_context_fastpath(ww, ww_ctx); + } + + mutex_set_owner(lock); + mspin_unlock(MLOCK(lock), &node); + preempt_enable(); + return 0; + } + mspin_unlock(MLOCK(lock), &node); + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + goto slowpath; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + arch_mutex_cpu_relax(); + } +slowpath: +#endif + spin_lock_mutex(&lock->wait_lock, flags); + + /* once more, can we acquire the lock? */ + if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) + goto skip_wait; + + debug_mutex_lock_common(lock, &waiter); + debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + + /* add waiting tasks to the end of the waitqueue (FIFO): */ + list_add_tail(&waiter.list, &lock->wait_list); + waiter.task = task; + + lock_contended(&lock->dep_map, ip); + + for (;;) { + /* + * Lets try to take the lock again - this is needed even if + * we get here for the first time (shortly after failing to + * acquire the lock), to make sure that we get a wakeup once + * it's unlocked. Later on, if we sleep, this is the + * operation that gives us the lock. We xchg it to -1, so + * that when we release the lock, we properly wake up the + * other waiters: + */ + if (MUTEX_SHOW_NO_WAITER(lock) && + (atomic_xchg(&lock->count, -1) == 1)) + break; + + /* + * got a signal? (This code gets eliminated in the + * TASK_UNINTERRUPTIBLE case.) + */ + if (unlikely(signal_pending_state(state, task))) { + ret = -EINTR; + goto err; + } + + if (use_ww_ctx && ww_ctx->acquired > 0) { + ret = __mutex_lock_check_stamp(lock, ww_ctx); + if (ret) + goto err; + } + + __set_task_state(task, state); + + /* didn't get the lock, go to sleep: */ + spin_unlock_mutex(&lock->wait_lock, flags); + schedule_preempt_disabled(); + spin_lock_mutex(&lock->wait_lock, flags); + } + mutex_remove_waiter(lock, &waiter, current_thread_info()); + /* set it to 0 if there are no waiters left: */ + if (likely(list_empty(&lock->wait_list))) + atomic_set(&lock->count, 0); + debug_mutex_free_waiter(&waiter); + +skip_wait: + /* got the lock - cleanup and rejoice! */ + lock_acquired(&lock->dep_map, ip); + mutex_set_owner(lock); + + if (use_ww_ctx) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + struct mutex_waiter *cur; + + /* + * This branch gets optimized out for the common case, + * and is only important for ww_mutex_lock. + */ + ww_mutex_lock_acquired(ww, ww_ctx); + ww->ctx = ww_ctx; + + /* + * Give any possible sleeping processes the chance to wake up, + * so they can recheck if they have to back off. + */ + list_for_each_entry(cur, &lock->wait_list, list) { + debug_mutex_wake_waiter(lock, cur); + wake_up_process(cur->task); + } + } + + spin_unlock_mutex(&lock->wait_lock, flags); + preempt_enable(); + return 0; + +err: + mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + spin_unlock_mutex(&lock->wait_lock, flags); + debug_mutex_free_waiter(&waiter); + mutex_release(&lock->dep_map, 1, ip); + preempt_enable(); + return ret; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched +mutex_lock_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + subclass, NULL, _RET_IP_, NULL, 0); +} + +EXPORT_SYMBOL_GPL(mutex_lock_nested); + +void __sched +_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ + might_sleep(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + 0, nest, _RET_IP_, NULL, 0); +} + +EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); + +int __sched +mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + return __mutex_lock_common(lock, TASK_KILLABLE, + subclass, NULL, _RET_IP_, NULL, 0); +} +EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); + +int __sched +mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, + subclass, NULL, _RET_IP_, NULL, 0); +} + +EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); + +static inline int +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH + unsigned tmp; + + if (ctx->deadlock_inject_countdown-- == 0) { + tmp = ctx->deadlock_inject_interval; + if (tmp > UINT_MAX/4) + tmp = UINT_MAX; + else + tmp = tmp*2 + tmp + tmp/2; + + ctx->deadlock_inject_interval = tmp; + ctx->deadlock_inject_countdown = tmp; + ctx->contending_lock = lock; + + ww_mutex_unlock(lock); + + return -EDEADLK; + } +#endif + + return 0; +} + +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, + 0, &ctx->dep_map, _RET_IP_, ctx, 1); + if (!ret && ctx->acquired > 1) + return ww_mutex_deadlock_injection(lock, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, + 0, &ctx->dep_map, _RET_IP_, ctx, 1); + + if (!ret && ctx->acquired > 1) + return ww_mutex_deadlock_injection(lock, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); + +#endif + +/* + * Release the lock, slowpath: + */ +static inline void +__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; + + spin_lock_mutex(&lock->wait_lock, flags); + mutex_release(&lock->dep_map, nested, _RET_IP_); + debug_mutex_unlock(lock); + + /* + * some architectures leave the lock unlocked in the fastpath failure + * case, others need to leave it locked. In the later case we have to + * unlock it here + */ + if (__mutex_slowpath_needs_to_unlock()) + atomic_set(&lock->count, 1); + + if (!list_empty(&lock->wait_list)) { + /* get the first entry from the wait-list: */ + struct mutex_waiter *waiter = + list_entry(lock->wait_list.next, + struct mutex_waiter, list); + + debug_mutex_wake_waiter(lock, waiter); + + wake_up_process(waiter->task); + } + + spin_unlock_mutex(&lock->wait_lock, flags); +} + +/* + * Release the lock, slowpath: + */ +static __used noinline void +__mutex_unlock_slowpath(atomic_t *lock_count) +{ + __mutex_unlock_common_slowpath(lock_count, 1); +} + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +/* + * Here come the less common (and hence less performance-critical) APIs: + * mutex_lock_interruptible() and mutex_trylock(). + */ +static noinline int __sched +__mutex_lock_killable_slowpath(struct mutex *lock); + +static noinline int __sched +__mutex_lock_interruptible_slowpath(struct mutex *lock); + +/** + * mutex_lock_interruptible - acquire the mutex, interruptible + * @lock: the mutex to be acquired + * + * Lock the mutex like mutex_lock(), and return 0 if the mutex has + * been acquired or sleep until the mutex becomes available. If a + * signal arrives while waiting for the lock then this function + * returns -EINTR. + * + * This function is similar to (but not equivalent to) down_interruptible(). + */ +int __sched mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + might_sleep(); + ret = __mutex_fastpath_lock_retval(&lock->count); + if (likely(!ret)) { + mutex_set_owner(lock); + return 0; + } else + return __mutex_lock_interruptible_slowpath(lock); +} + +EXPORT_SYMBOL(mutex_lock_interruptible); + +int __sched mutex_lock_killable(struct mutex *lock) +{ + int ret; + + might_sleep(); + ret = __mutex_fastpath_lock_retval(&lock->count); + if (likely(!ret)) { + mutex_set_owner(lock); + return 0; + } else + return __mutex_lock_killable_slowpath(lock); +} +EXPORT_SYMBOL(mutex_lock_killable); + +static __used noinline void __sched +__mutex_lock_slowpath(atomic_t *lock_count) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, + NULL, _RET_IP_, NULL, 0); +} + +static noinline int __sched +__mutex_lock_killable_slowpath(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_KILLABLE, 0, + NULL, _RET_IP_, NULL, 0); +} + +static noinline int __sched +__mutex_lock_interruptible_slowpath(struct mutex *lock) +{ + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, + NULL, _RET_IP_, NULL, 0); +} + +static noinline int __sched +__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, + NULL, _RET_IP_, ctx, 1); +} + +static noinline int __sched +__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, + struct ww_acquire_ctx *ctx) +{ + return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, + NULL, _RET_IP_, ctx, 1); +} + +#endif + +/* + * Spinlock based trylock, we take the spinlock and check whether we + * can get the lock: + */ +static inline int __mutex_trylock_slowpath(atomic_t *lock_count) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; + int prev; + + spin_lock_mutex(&lock->wait_lock, flags); + + prev = atomic_xchg(&lock->count, -1); + if (likely(prev == 1)) { + mutex_set_owner(lock); + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + } + + /* Set it back to 0 if there are no waiters: */ + if (likely(list_empty(&lock->wait_list))) + atomic_set(&lock->count, 0); + + spin_unlock_mutex(&lock->wait_lock, flags); + + return prev == 1; +} + +/** + * mutex_trylock - try to acquire the mutex, without waiting + * @lock: the mutex to be acquired + * + * Try to acquire the mutex atomically. Returns 1 if the mutex + * has been acquired successfully, and 0 on contention. + * + * NOTE: this function follows the spin_trylock() convention, so + * it is negated from the down_trylock() return values! Be careful + * about this when converting semaphore users to mutexes. + * + * This function must not be used in interrupt context. The + * mutex must be released by the same task that acquired it. + */ +int __sched mutex_trylock(struct mutex *lock) +{ + int ret; + + ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); + if (ret) + mutex_set_owner(lock); + + return ret; +} +EXPORT_SYMBOL(mutex_trylock); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +int __sched +__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + + ret = __mutex_fastpath_lock_retval(&lock->base.count); + + if (likely(!ret)) { + ww_mutex_set_context_fastpath(lock, ctx); + mutex_set_owner(&lock->base); + } else + ret = __ww_mutex_lock_slowpath(lock, ctx); + return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock); + +int __sched +__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + + ret = __mutex_fastpath_lock_retval(&lock->base.count); + + if (likely(!ret)) { + ww_mutex_set_context_fastpath(lock, ctx); + mutex_set_owner(&lock->base); + } else + ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); + return ret; +} +EXPORT_SYMBOL(__ww_mutex_lock_interruptible); + +#endif + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h new file mode 100644 index 000000000000..4115fbf83b12 --- /dev/null +++ b/kernel/locking/mutex.h @@ -0,0 +1,48 @@ +/* + * Mutexes: blocking mutual exclusion locks + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * This file contains mutex debugging related internal prototypes, for the + * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: + */ + +#define spin_lock_mutex(lock, flags) \ + do { spin_lock(lock); (void)(flags); } while (0) +#define spin_unlock_mutex(lock, flags) \ + do { spin_unlock(lock); (void)(flags); } while (0) +#define mutex_remove_waiter(lock, waiter, ti) \ + __list_del((waiter)->list.prev, (waiter)->list.next) + +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current; +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif + +#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) +#define debug_mutex_free_waiter(waiter) do { } while (0) +#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) +#define debug_mutex_unlock(lock) do { } while (0) +#define debug_mutex_init(lock, name, key) do { } while (0) + +static inline void +debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) +{ +} diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c deleted file mode 100644 index 7e3443fe1f48..000000000000 --- a/kernel/mutex-debug.c +++ /dev/null @@ -1,110 +0,0 @@ -/* - * kernel/mutex-debug.c - * - * Debugging code for mutexes - * - * Started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar - * - * lock debugging, locking tree, deadlock detection started by: - * - * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey - * Released under the General Public License (GPL). - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mutex-debug.h" - -/* - * Must be called with lock->wait_lock held. - */ -void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) -{ - memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); - waiter->magic = waiter; - INIT_LIST_HEAD(&waiter->list); -} - -void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) -{ - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); - DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); - DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); -} - -void debug_mutex_free_waiter(struct mutex_waiter *waiter) -{ - DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list)); - memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); -} - -void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) -{ - SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); - - /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; -} - -void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) -{ - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; - - list_del_init(&waiter->list); - waiter->task = NULL; -} - -void debug_mutex_unlock(struct mutex *lock) -{ - if (unlikely(!debug_locks)) - return; - - DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); - mutex_clear_owner(lock); -} - -void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - lock->magic = lock; -} - -/*** - * mutex_destroy - mark a mutex unusable - * @lock: the mutex to be destroyed - * - * This function marks the mutex uninitialized, and any subsequent - * use of the mutex is forbidden. The mutex must not be locked when - * this function is called. - */ -void mutex_destroy(struct mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); - lock->magic = NULL; -} - -EXPORT_SYMBOL_GPL(mutex_destroy); diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h deleted file mode 100644 index 0799fd3e4cfa..000000000000 --- a/kernel/mutex-debug.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Mutexes: blocking mutual exclusion locks - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar - * - * This file contains mutex debugging related internal declarations, - * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case. - * More details are in kernel/mutex-debug.c. - */ - -/* - * This must be called with lock->wait_lock held. - */ -extern void debug_mutex_lock_common(struct mutex *lock, - struct mutex_waiter *waiter); -extern void debug_mutex_wake_waiter(struct mutex *lock, - struct mutex_waiter *waiter); -extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); -extern void debug_mutex_add_waiter(struct mutex *lock, - struct mutex_waiter *waiter, - struct thread_info *ti); -extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); -extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); - -static inline void mutex_set_owner(struct mutex *lock) -{ - lock->owner = current; -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - lock->owner = NULL; -} - -#define spin_lock_mutex(lock, flags) \ - do { \ - struct mutex *l = container_of(lock, struct mutex, wait_lock); \ - \ - DEBUG_LOCKS_WARN_ON(in_interrupt()); \ - local_irq_save(flags); \ - arch_spin_lock(&(lock)->rlock.raw_lock);\ - DEBUG_LOCKS_WARN_ON(l->magic != l); \ - } while (0) - -#define spin_unlock_mutex(lock, flags) \ - do { \ - arch_spin_unlock(&(lock)->rlock.raw_lock); \ - local_irq_restore(flags); \ - preempt_check_resched(); \ - } while (0) diff --git a/kernel/mutex.c b/kernel/mutex.c deleted file mode 100644 index d24105b1b794..000000000000 --- a/kernel/mutex.c +++ /dev/null @@ -1,960 +0,0 @@ -/* - * kernel/mutex.c - * - * Mutexes: blocking mutual exclusion locks - * - * Started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar - * - * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and - * David Howells for suggestions and improvements. - * - * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline - * from the -rt tree, where it was originally implemented for rtmutexes - * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale - * and Sven Dietrich. - * - * Also see Documentation/mutex-design.txt. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * In the DEBUG case we are using the "NULL fastpath" for mutexes, - * which forces all calls into the slowpath: - */ -#ifdef CONFIG_DEBUG_MUTEXES -# include "mutex-debug.h" -# include -#else -# include "mutex.h" -# include -#endif - -/* - * A negative mutex count indicates that waiters are sleeping waiting for the - * mutex. - */ -#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) - -void -__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) -{ - atomic_set(&lock->count, 1); - spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); - mutex_clear_owner(lock); -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->spin_mlock = NULL; -#endif - - debug_mutex_init(lock, name, key); -} - -EXPORT_SYMBOL(__mutex_init); - -#ifndef CONFIG_DEBUG_LOCK_ALLOC -/* - * We split the mutex lock/unlock logic into separate fastpath and - * slowpath functions, to reduce the register pressure on the fastpath. - * We also put the fastpath first in the kernel image, to make sure the - * branch is predicted by the CPU as default-untaken. - */ -static __used noinline void __sched -__mutex_lock_slowpath(atomic_t *lock_count); - -/** - * mutex_lock - acquire the mutex - * @lock: the mutex to be acquired - * - * Lock the mutex exclusively for this task. If the mutex is not - * available right now, it will sleep until it can get it. - * - * The mutex must later on be released by the same task that - * acquired it. Recursive locking is not allowed. The task - * may not exit without first unlocking the mutex. Also, kernel - * memory where the mutex resides mutex must not be freed with - * the mutex still locked. The mutex must first be initialized - * (or statically defined) before it can be locked. memset()-ing - * the mutex to 0 is not allowed. - * - * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging - * checks that will enforce the restrictions and will also do - * deadlock debugging. ) - * - * This function is similar to (but not equivalent to) down(). - */ -void __sched mutex_lock(struct mutex *lock) -{ - might_sleep(); - /* - * The locking fastpath is the 1->0 transition from - * 'unlocked' into 'locked' state. - */ - __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); - mutex_set_owner(lock); -} - -EXPORT_SYMBOL(mutex_lock); -#endif - -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * In order to avoid a stampede of mutex spinners from acquiring the mutex - * more or less simultaneously, the spinners need to acquire a MCS lock - * first before spinning on the owner field. - * - * We don't inline mspin_lock() so that perf can correctly account for the - * time spent in this lock function. - */ -struct mspin_node { - struct mspin_node *next ; - int locked; /* 1 if lock acquired */ -}; -#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) - -static noinline -void mspin_lock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *prev; - - /* Init node */ - node->locked = 0; - node->next = NULL; - - prev = xchg(lock, node); - if (likely(prev == NULL)) { - /* Lock acquired */ - node->locked = 1; - return; - } - ACCESS_ONCE(prev->next) = node; - smp_wmb(); - /* Wait until the lock holder passes the lock down */ - while (!ACCESS_ONCE(node->locked)) - arch_mutex_cpu_relax(); -} - -static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *next = ACCESS_ONCE(node->next); - - if (likely(!next)) { - /* - * Release the lock by setting it to NULL - */ - if (cmpxchg(lock, node, NULL) == node) - return; - /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); - } - ACCESS_ONCE(next->locked) = 1; - smp_wmb(); -} - -/* - * Mutex spinning code migrated from kernel/sched/core.c - */ - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ - if (lock->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -static noinline -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ - rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) - break; - - arch_mutex_cpu_relax(); - } - rcu_read_unlock(); - - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; -} - -/* - * Initial check for entering the mutex spinning loop - */ -static inline int mutex_can_spin_on_owner(struct mutex *lock) -{ - struct task_struct *owner; - int retval = 1; - - rcu_read_lock(); - owner = ACCESS_ONCE(lock->owner); - if (owner) - retval = owner->on_cpu; - rcu_read_unlock(); - /* - * if lock->owner is not set, the mutex owner may have just acquired - * it and not set the owner yet or the mutex has been released. - */ - return retval; -} -#endif - -static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); - -/** - * mutex_unlock - release the mutex - * @lock: the mutex to be released - * - * Unlock a mutex that has been locked by this task previously. - * - * This function must not be used in interrupt context. Unlocking - * of a not locked mutex is not allowed. - * - * This function is similar to (but not equivalent to) up(). - */ -void __sched mutex_unlock(struct mutex *lock) -{ - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(lock); -#endif - __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); -} - -EXPORT_SYMBOL(mutex_unlock); - -/** - * ww_mutex_unlock - release the w/w mutex - * @lock: the mutex to be released - * - * Unlock a mutex that has been locked by this task previously with any of the - * ww_mutex_lock* functions (with or without an acquire context). It is - * forbidden to release the locks after releasing the acquire context. - * - * This function must not be used in interrupt context. Unlocking - * of a unlocked mutex is not allowed. - */ -void __sched ww_mutex_unlock(struct ww_mutex *lock) -{ - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ - if (lock->ctx) { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); -#endif - if (lock->ctx->acquired > 0) - lock->ctx->acquired--; - lock->ctx = NULL; - } - -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(&lock->base); -#endif - __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); -} -EXPORT_SYMBOL(ww_mutex_unlock); - -static inline int __sched -__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) -{ - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); - - if (!hold_ctx) - return 0; - - if (unlikely(ctx == hold_ctx)) - return -EALREADY; - - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && - (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { -#ifdef CONFIG_DEBUG_MUTEXES - DEBUG_LOCKS_WARN_ON(ctx->contending_lock); - ctx->contending_lock = ww; -#endif - return -EDEADLK; - } - - return 0; -} - -static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, - struct ww_acquire_ctx *ww_ctx) -{ -#ifdef CONFIG_DEBUG_MUTEXES - /* - * If this WARN_ON triggers, you used ww_mutex_lock to acquire, - * but released with a normal mutex_unlock in this call. - * - * This should never happen, always use ww_mutex_unlock. - */ - DEBUG_LOCKS_WARN_ON(ww->ctx); - - /* - * Not quite done after calling ww_acquire_done() ? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); - - if (ww_ctx->contending_lock) { - /* - * After -EDEADLK you tried to - * acquire a different ww_mutex? Bad! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); - - /* - * You called ww_mutex_lock after receiving -EDEADLK, - * but 'forgot' to unlock everything else first? - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); - ww_ctx->contending_lock = NULL; - } - - /* - * Naughty, using a different class will lead to undefined behavior! - */ - DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); -#endif - ww_ctx->acquired++; -} - -/* - * after acquiring lock with fastpath or when we lost out in contested - * slowpath, set ctx and wake up any waiters so they can recheck. - * - * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, - * as the fastpath and opportunistic spinning are disabled in that case. - */ -static __always_inline void -ww_mutex_set_context_fastpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) -{ - unsigned long flags; - struct mutex_waiter *cur; - - ww_mutex_lock_acquired(lock, ctx); - - lock->ctx = ctx; - - /* - * The lock->ctx update should be visible on all cores before - * the atomic read is done, otherwise contended waiters might be - * missed. The contended waiters will either see ww_ctx == NULL - * and keep spinning, or it will acquire wait_lock, add itself - * to waiter list and sleep. - */ - smp_mb(); /* ^^^ */ - - /* - * Check if lock is contended, if not there is nobody to wake up - */ - if (likely(atomic_read(&lock->base.count) == 0)) - return; - - /* - * Uh oh, we raced in fastpath, wake up everyone in this case, - * so they can see the new lock->ctx. - */ - spin_lock_mutex(&lock->base.wait_lock, flags); - list_for_each_entry(cur, &lock->base.wait_list, list) { - debug_mutex_wake_waiter(&lock->base, cur); - wake_up_process(cur->task); - } - spin_unlock_mutex(&lock->base.wait_lock, flags); -} - -/* - * Lock a mutex (possibly interruptible), slowpath: - */ -static __always_inline int __sched -__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, - struct lockdep_map *nest_lock, unsigned long ip, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) -{ - struct task_struct *task = current; - struct mutex_waiter waiter; - unsigned long flags; - int ret; - - preempt_disable(); - mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - /* - * Optimistic spinning. - * - * We try to spin for acquisition when we find that there are no - * pending waiters and the lock owner is currently running on a - * (different) CPU. - * - * The rationale is that if the lock owner is running, it is likely to - * release the lock soon. - * - * Since this needs the lock owner, and this mutex implementation - * doesn't track the owner atomically in the lock field, we need to - * track it non-atomically. - * - * We can't do this for DEBUG_MUTEXES because that relies on wait_lock - * to serialize everything. - * - * The mutex spinners are queued up using MCS lock so that only one - * spinner can compete for the mutex. However, if mutex spinning isn't - * going to happen, there is no point in going through the lock/unlock - * overhead. - */ - if (!mutex_can_spin_on_owner(lock)) - goto slowpath; - - for (;;) { - struct task_struct *owner; - struct mspin_node node; - - if (use_ww_ctx && ww_ctx->acquired > 0) { - struct ww_mutex *ww; - - ww = container_of(lock, struct ww_mutex, base); - /* - * If ww->ctx is set the contents are undefined, only - * by acquiring wait_lock there is a guarantee that - * they are not invalid when reading. - * - * As such, when deadlock detection needs to be - * performed the optimistic spinning cannot be done. - */ - if (ACCESS_ONCE(ww->ctx)) - goto slowpath; - } - - /* - * If there's an owner, wait for it to either - * release the lock or go to sleep. - */ - mspin_lock(MLOCK(lock), &node); - owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) { - mspin_unlock(MLOCK(lock), &node); - goto slowpath; - } - - if ((atomic_read(&lock->count) == 1) && - (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { - lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - - ww_mutex_set_context_fastpath(ww, ww_ctx); - } - - mutex_set_owner(lock); - mspin_unlock(MLOCK(lock), &node); - preempt_enable(); - return 0; - } - mspin_unlock(MLOCK(lock), &node); - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) - goto slowpath; - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. - */ - arch_mutex_cpu_relax(); - } -slowpath: -#endif - spin_lock_mutex(&lock->wait_lock, flags); - - /* once more, can we acquire the lock? */ - if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) - goto skip_wait; - - debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); - - /* add waiting tasks to the end of the waitqueue (FIFO): */ - list_add_tail(&waiter.list, &lock->wait_list); - waiter.task = task; - - lock_contended(&lock->dep_map, ip); - - for (;;) { - /* - * Lets try to take the lock again - this is needed even if - * we get here for the first time (shortly after failing to - * acquire the lock), to make sure that we get a wakeup once - * it's unlocked. Later on, if we sleep, this is the - * operation that gives us the lock. We xchg it to -1, so - * that when we release the lock, we properly wake up the - * other waiters: - */ - if (MUTEX_SHOW_NO_WAITER(lock) && - (atomic_xchg(&lock->count, -1) == 1)) - break; - - /* - * got a signal? (This code gets eliminated in the - * TASK_UNINTERRUPTIBLE case.) - */ - if (unlikely(signal_pending_state(state, task))) { - ret = -EINTR; - goto err; - } - - if (use_ww_ctx && ww_ctx->acquired > 0) { - ret = __mutex_lock_check_stamp(lock, ww_ctx); - if (ret) - goto err; - } - - __set_task_state(task, state); - - /* didn't get the lock, go to sleep: */ - spin_unlock_mutex(&lock->wait_lock, flags); - schedule_preempt_disabled(); - spin_lock_mutex(&lock->wait_lock, flags); - } - mutex_remove_waiter(lock, &waiter, current_thread_info()); - /* set it to 0 if there are no waiters left: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - debug_mutex_free_waiter(&waiter); - -skip_wait: - /* got the lock - cleanup and rejoice! */ - lock_acquired(&lock->dep_map, ip); - mutex_set_owner(lock); - - if (use_ww_ctx) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct mutex_waiter *cur; - - /* - * This branch gets optimized out for the common case, - * and is only important for ww_mutex_lock. - */ - ww_mutex_lock_acquired(ww, ww_ctx); - ww->ctx = ww_ctx; - - /* - * Give any possible sleeping processes the chance to wake up, - * so they can recheck if they have to back off. - */ - list_for_each_entry(cur, &lock->wait_list, list) { - debug_mutex_wake_waiter(lock, cur); - wake_up_process(cur->task); - } - } - - spin_unlock_mutex(&lock->wait_lock, flags); - preempt_enable(); - return 0; - -err: - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); - spin_unlock_mutex(&lock->wait_lock, flags); - debug_mutex_free_waiter(&waiter); - mutex_release(&lock->dep_map, 1, ip); - preempt_enable(); - return ret; -} - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void __sched -mutex_lock_nested(struct mutex *lock, unsigned int subclass) -{ - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL, 0); -} - -EXPORT_SYMBOL_GPL(mutex_lock_nested); - -void __sched -_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) -{ - might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - 0, nest, _RET_IP_, NULL, 0); -} - -EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); - -int __sched -mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) -{ - might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, - subclass, NULL, _RET_IP_, NULL, 0); -} -EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); - -int __sched -mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) -{ - might_sleep(); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL, 0); -} - -EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); - -static inline int -ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ -#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH - unsigned tmp; - - if (ctx->deadlock_inject_countdown-- == 0) { - tmp = ctx->deadlock_inject_interval; - if (tmp > UINT_MAX/4) - tmp = UINT_MAX; - else - tmp = tmp*2 + tmp + tmp/2; - - ctx->deadlock_inject_interval = tmp; - ctx->deadlock_inject_countdown = tmp; - ctx->contending_lock = lock; - - ww_mutex_unlock(lock); - - return -EDEADLK; - } -#endif - - return 0; -} - -int __sched -__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - int ret; - - might_sleep(); - ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx, 1); - if (!ret && ctx->acquired > 1) - return ww_mutex_deadlock_injection(lock, ctx); - - return ret; -} -EXPORT_SYMBOL_GPL(__ww_mutex_lock); - -int __sched -__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - int ret; - - might_sleep(); - ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx, 1); - - if (!ret && ctx->acquired > 1) - return ww_mutex_deadlock_injection(lock, ctx); - - return ret; -} -EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); - -#endif - -/* - * Release the lock, slowpath: - */ -static inline void -__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - unsigned long flags; - - spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); - debug_mutex_unlock(lock); - - /* - * some architectures leave the lock unlocked in the fastpath failure - * case, others need to leave it locked. In the later case we have to - * unlock it here - */ - if (__mutex_slowpath_needs_to_unlock()) - atomic_set(&lock->count, 1); - - if (!list_empty(&lock->wait_list)) { - /* get the first entry from the wait-list: */ - struct mutex_waiter *waiter = - list_entry(lock->wait_list.next, - struct mutex_waiter, list); - - debug_mutex_wake_waiter(lock, waiter); - - wake_up_process(waiter->task); - } - - spin_unlock_mutex(&lock->wait_lock, flags); -} - -/* - * Release the lock, slowpath: - */ -static __used noinline void -__mutex_unlock_slowpath(atomic_t *lock_count) -{ - __mutex_unlock_common_slowpath(lock_count, 1); -} - -#ifndef CONFIG_DEBUG_LOCK_ALLOC -/* - * Here come the less common (and hence less performance-critical) APIs: - * mutex_lock_interruptible() and mutex_trylock(). - */ -static noinline int __sched -__mutex_lock_killable_slowpath(struct mutex *lock); - -static noinline int __sched -__mutex_lock_interruptible_slowpath(struct mutex *lock); - -/** - * mutex_lock_interruptible - acquire the mutex, interruptible - * @lock: the mutex to be acquired - * - * Lock the mutex like mutex_lock(), and return 0 if the mutex has - * been acquired or sleep until the mutex becomes available. If a - * signal arrives while waiting for the lock then this function - * returns -EINTR. - * - * This function is similar to (but not equivalent to) down_interruptible(). - */ -int __sched mutex_lock_interruptible(struct mutex *lock) -{ - int ret; - - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); - return 0; - } else - return __mutex_lock_interruptible_slowpath(lock); -} - -EXPORT_SYMBOL(mutex_lock_interruptible); - -int __sched mutex_lock_killable(struct mutex *lock) -{ - int ret; - - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); - return 0; - } else - return __mutex_lock_killable_slowpath(lock); -} -EXPORT_SYMBOL(mutex_lock_killable); - -static __used noinline void __sched -__mutex_lock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL, 0); -} - -static noinline int __sched -__mutex_lock_killable_slowpath(struct mutex *lock) -{ - return __mutex_lock_common(lock, TASK_KILLABLE, 0, - NULL, _RET_IP_, NULL, 0); -} - -static noinline int __sched -__mutex_lock_interruptible_slowpath(struct mutex *lock) -{ - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL, 0); -} - -static noinline int __sched -__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx, 1); -} - -static noinline int __sched -__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, - struct ww_acquire_ctx *ctx) -{ - return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx, 1); -} - -#endif - -/* - * Spinlock based trylock, we take the spinlock and check whether we - * can get the lock: - */ -static inline int __mutex_trylock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - unsigned long flags; - int prev; - - spin_lock_mutex(&lock->wait_lock, flags); - - prev = atomic_xchg(&lock->count, -1); - if (likely(prev == 1)) { - mutex_set_owner(lock); - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } - - /* Set it back to 0 if there are no waiters: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - - spin_unlock_mutex(&lock->wait_lock, flags); - - return prev == 1; -} - -/** - * mutex_trylock - try to acquire the mutex, without waiting - * @lock: the mutex to be acquired - * - * Try to acquire the mutex atomically. Returns 1 if the mutex - * has been acquired successfully, and 0 on contention. - * - * NOTE: this function follows the spin_trylock() convention, so - * it is negated from the down_trylock() return values! Be careful - * about this when converting semaphore users to mutexes. - * - * This function must not be used in interrupt context. The - * mutex must be released by the same task that acquired it. - */ -int __sched mutex_trylock(struct mutex *lock) -{ - int ret; - - ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); - if (ret) - mutex_set_owner(lock); - - return ret; -} -EXPORT_SYMBOL(mutex_trylock); - -#ifndef CONFIG_DEBUG_LOCK_ALLOC -int __sched -__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - int ret; - - might_sleep(); - - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { - ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_slowpath(lock, ctx); - return ret; -} -EXPORT_SYMBOL(__ww_mutex_lock); - -int __sched -__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) -{ - int ret; - - might_sleep(); - - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { - ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); - return ret; -} -EXPORT_SYMBOL(__ww_mutex_lock_interruptible); - -#endif - -/** - * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 - * @cnt: the atomic which we are to dec - * @lock: the mutex to return holding if we dec to 0 - * - * return true and hold lock if we dec to 0, return false otherwise - */ -int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) -{ - /* dec if we can't possibly hit 0 */ - if (atomic_add_unless(cnt, -1, 1)) - return 0; - /* we might hit 0, so take the lock */ - mutex_lock(lock); - if (!atomic_dec_and_test(cnt)) { - /* when we actually did the dec, we didn't hit 0 */ - mutex_unlock(lock); - return 0; - } - /* we hit 0, and we hold the lock */ - return 1; -} -EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/mutex.h b/kernel/mutex.h deleted file mode 100644 index 4115fbf83b12..000000000000 --- a/kernel/mutex.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Mutexes: blocking mutual exclusion locks - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar - * - * This file contains mutex debugging related internal prototypes, for the - * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: - */ - -#define spin_lock_mutex(lock, flags) \ - do { spin_lock(lock); (void)(flags); } while (0) -#define spin_unlock_mutex(lock, flags) \ - do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ - __list_del((waiter)->list.prev, (waiter)->list.next) - -#ifdef CONFIG_SMP -static inline void mutex_set_owner(struct mutex *lock) -{ - lock->owner = current; -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - lock->owner = NULL; -} -#else -static inline void mutex_set_owner(struct mutex *lock) -{ -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ -} -#endif - -#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) -#define debug_mutex_free_waiter(waiter) do { } while (0) -#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) -#define debug_mutex_unlock(lock) do { } while (0) -#define debug_mutex_init(lock, name, key) do { } while (0) - -static inline void -debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) -{ -} -- cgit v1.3 From 8eddac3f103736163f49255bcb109edadea167f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 18:14:17 +0100 Subject: locking: Move the lockdep code to kernel/locking/ Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-wl7s3tta5isufzfguc23et06@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Makefile | 6 - kernel/lockdep.c | 4257 ------------------------------------ kernel/lockdep_internals.h | 170 -- kernel/lockdep_proc.c | 683 ------ kernel/lockdep_states.h | 9 - kernel/locking/Makefile | 6 + kernel/locking/lockdep.c | 4257 ++++++++++++++++++++++++++++++++++++ kernel/locking/lockdep_internals.h | 170 ++ kernel/locking/lockdep_proc.c | 683 ++++++ kernel/locking/lockdep_states.h | 9 + 10 files changed, 5125 insertions(+), 5125 deletions(-) delete mode 100644 kernel/lockdep.c delete mode 100644 kernel/lockdep_internals.h delete mode 100644 kernel/lockdep_proc.c delete mode 100644 kernel/lockdep_states.h create mode 100644 kernel/locking/lockdep.c create mode 100644 kernel/locking/lockdep_internals.h create mode 100644 kernel/locking/lockdep_proc.c create mode 100644 kernel/locking/lockdep_states.h (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 330b14666475..4fffd6ee42c1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -14,8 +14,6 @@ obj-y = fork.o exec_domain.o panic.o \ ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_lockdep.o = -pg -CFLAGS_REMOVE_lockdep_proc.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif @@ -33,10 +31,6 @@ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ -obj-$(CONFIG_LOCKDEP) += lockdep.o -ifeq ($(CONFIG_PROC_FS),y) -obj-$(CONFIG_LOCKDEP) += lockdep_proc.o -endif obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o diff --git a/kernel/lockdep.c b/kernel/lockdep.c deleted file mode 100644 index 4e8e14c34e42..000000000000 --- a/kernel/lockdep.c +++ /dev/null @@ -1,4257 +0,0 @@ -/* - * kernel/lockdep.c - * - * Runtime locking correctness validator - * - * Started by Ingo Molnar: - * - * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * this code maps all the lock dependencies as they occur in a live kernel - * and will warn about the following classes of locking bugs: - * - * - lock inversion scenarios - * - circular lock dependencies - * - hardirq/softirq safe/unsafe locking bugs - * - * Bugs are reported even if the current locking scenario does not cause - * any deadlock at this point. - * - * I.e. if anytime in the past two locks were taken in a different order, - * even if it happened for another task, even if those were different - * locks (but of the same class as this lock), this code will detect it. - * - * Thanks to Arjan van de Ven for coming up with the initial idea of - * mapping lock dependencies runtime. - */ -#define DISABLE_BRANCH_PROFILING -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "lockdep_internals.h" - -#define CREATE_TRACE_POINTS -#include - -#ifdef CONFIG_PROVE_LOCKING -int prove_locking = 1; -module_param(prove_locking, int, 0644); -#else -#define prove_locking 0 -#endif - -#ifdef CONFIG_LOCK_STAT -int lock_stat = 1; -module_param(lock_stat, int, 0644); -#else -#define lock_stat 0 -#endif - -/* - * lockdep_lock: protects the lockdep graph, the hashes and the - * class/list/hash allocators. - * - * This is one of the rare exceptions where it's justified - * to use a raw spinlock - we really dont want the spinlock - * code to recurse back into the lockdep code... - */ -static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - -static int graph_lock(void) -{ - arch_spin_lock(&lockdep_lock); - /* - * Make sure that if another CPU detected a bug while - * walking the graph we dont change it (while the other - * CPU is busy printing out stuff with the graph lock - * dropped already) - */ - if (!debug_locks) { - arch_spin_unlock(&lockdep_lock); - return 0; - } - /* prevent any recursions within lockdep from causing deadlocks */ - current->lockdep_recursion++; - return 1; -} - -static inline int graph_unlock(void) -{ - if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { - /* - * The lockdep graph lock isn't locked while we expect it to - * be, we're confused now, bye! - */ - return DEBUG_LOCKS_WARN_ON(1); - } - - current->lockdep_recursion--; - arch_spin_unlock(&lockdep_lock); - return 0; -} - -/* - * Turn lock debugging off and return with 0 if it was off already, - * and also release the graph lock: - */ -static inline int debug_locks_off_graph_unlock(void) -{ - int ret = debug_locks_off(); - - arch_spin_unlock(&lockdep_lock); - - return ret; -} - -static int lockdep_initialized; - -unsigned long nr_list_entries; -static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; - -/* - * All data structures here are protected by the global debug_lock. - * - * Mutex key structs only get allocated, once during bootup, and never - * get freed - this significantly simplifies the debugging code. - */ -unsigned long nr_lock_classes; -static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; - -static inline struct lock_class *hlock_class(struct held_lock *hlock) -{ - if (!hlock->class_idx) { - /* - * Someone passed in garbage, we give up. - */ - DEBUG_LOCKS_WARN_ON(1); - return NULL; - } - return lock_classes + hlock->class_idx - 1; -} - -#ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], - cpu_lock_stats); - -static inline u64 lockstat_clock(void) -{ - return local_clock(); -} - -static int lock_point(unsigned long points[], unsigned long ip) -{ - int i; - - for (i = 0; i < LOCKSTAT_POINTS; i++) { - if (points[i] == 0) { - points[i] = ip; - break; - } - if (points[i] == ip) - break; - } - - return i; -} - -static void lock_time_inc(struct lock_time *lt, u64 time) -{ - if (time > lt->max) - lt->max = time; - - if (time < lt->min || !lt->nr) - lt->min = time; - - lt->total += time; - lt->nr++; -} - -static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) -{ - if (!src->nr) - return; - - if (src->max > dst->max) - dst->max = src->max; - - if (src->min < dst->min || !dst->nr) - dst->min = src->min; - - dst->total += src->total; - dst->nr += src->nr; -} - -struct lock_class_stats lock_stats(struct lock_class *class) -{ - struct lock_class_stats stats; - int cpu, i; - - memset(&stats, 0, sizeof(struct lock_class_stats)); - for_each_possible_cpu(cpu) { - struct lock_class_stats *pcs = - &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; - - for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) - stats.contention_point[i] += pcs->contention_point[i]; - - for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) - stats.contending_point[i] += pcs->contending_point[i]; - - lock_time_add(&pcs->read_waittime, &stats.read_waittime); - lock_time_add(&pcs->write_waittime, &stats.write_waittime); - - lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); - lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); - - for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) - stats.bounces[i] += pcs->bounces[i]; - } - - return stats; -} - -void clear_lock_stats(struct lock_class *class) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct lock_class_stats *cpu_stats = - &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; - - memset(cpu_stats, 0, sizeof(struct lock_class_stats)); - } - memset(class->contention_point, 0, sizeof(class->contention_point)); - memset(class->contending_point, 0, sizeof(class->contending_point)); -} - -static struct lock_class_stats *get_lock_stats(struct lock_class *class) -{ - return &get_cpu_var(cpu_lock_stats)[class - lock_classes]; -} - -static void put_lock_stats(struct lock_class_stats *stats) -{ - put_cpu_var(cpu_lock_stats); -} - -static void lock_release_holdtime(struct held_lock *hlock) -{ - struct lock_class_stats *stats; - u64 holdtime; - - if (!lock_stat) - return; - - holdtime = lockstat_clock() - hlock->holdtime_stamp; - - stats = get_lock_stats(hlock_class(hlock)); - if (hlock->read) - lock_time_inc(&stats->read_holdtime, holdtime); - else - lock_time_inc(&stats->write_holdtime, holdtime); - put_lock_stats(stats); -} -#else -static inline void lock_release_holdtime(struct held_lock *hlock) -{ -} -#endif - -/* - * We keep a global list of all lock classes. The list only grows, - * never shrinks. The list is only accessed with the lockdep - * spinlock lock held. - */ -LIST_HEAD(all_lock_classes); - -/* - * The lockdep classes are in a hash-table as well, for fast lookup: - */ -#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) -#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) -#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) -#define classhashentry(key) (classhash_table + __classhashfn((key))) - -static struct list_head classhash_table[CLASSHASH_SIZE]; - -/* - * We put the lock dependency chains into a hash-table as well, to cache - * their existence: - */ -#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) -#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) -#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) -#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) - -static struct list_head chainhash_table[CHAINHASH_SIZE]; - -/* - * The hash key of the lock dependency chains is a hash itself too: - * it's a hash of all locks taken up to that lock, including that lock. - * It's a 64-bit hash, because it's important for the keys to be - * unique. - */ -#define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ - (key2)) - -void lockdep_off(void) -{ - current->lockdep_recursion++; -} -EXPORT_SYMBOL(lockdep_off); - -void lockdep_on(void) -{ - current->lockdep_recursion--; -} -EXPORT_SYMBOL(lockdep_on); - -/* - * Debugging switches: - */ - -#define VERBOSE 0 -#define VERY_VERBOSE 0 - -#if VERBOSE -# define HARDIRQ_VERBOSE 1 -# define SOFTIRQ_VERBOSE 1 -# define RECLAIM_VERBOSE 1 -#else -# define HARDIRQ_VERBOSE 0 -# define SOFTIRQ_VERBOSE 0 -# define RECLAIM_VERBOSE 0 -#endif - -#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE -/* - * Quick filtering for interesting events: - */ -static int class_filter(struct lock_class *class) -{ -#if 0 - /* Example */ - if (class->name_version == 1 && - !strcmp(class->name, "lockname")) - return 1; - if (class->name_version == 1 && - !strcmp(class->name, "&struct->lockfield")) - return 1; -#endif - /* Filter everything else. 1 would be to allow everything else */ - return 0; -} -#endif - -static int verbose(struct lock_class *class) -{ -#if VERBOSE - return class_filter(class); -#endif - return 0; -} - -/* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. - */ -unsigned long nr_stack_trace_entries; -static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; - -static void print_lockdep_off(const char *bug_msg) -{ - printk(KERN_DEBUG "%s\n", bug_msg); - printk(KERN_DEBUG "turning off the locking correctness validator.\n"); - printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); -} - -static int save_trace(struct stack_trace *trace) -{ - trace->nr_entries = 0; - trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->entries = stack_trace + nr_stack_trace_entries; - - trace->skip = 3; - - save_stack_trace(trace); - - /* - * Some daft arches put -1 at the end to indicate its a full trace. - * - * this is buggy anyway, since it takes a whole extra entry so a - * complete trace that maxes out the entries provided will be reported - * as incomplete, friggin useless - */ - if (trace->nr_entries != 0 && - trace->entries[trace->nr_entries-1] == ULONG_MAX) - trace->nr_entries--; - - trace->max_entries = trace->nr_entries; - - nr_stack_trace_entries += trace->nr_entries; - - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { - if (!debug_locks_off_graph_unlock()) - return 0; - - print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); - dump_stack(); - - return 0; - } - - return 1; -} - -unsigned int nr_hardirq_chains; -unsigned int nr_softirq_chains; -unsigned int nr_process_chains; -unsigned int max_lockdep_depth; - -#ifdef CONFIG_DEBUG_LOCKDEP -/* - * We cannot printk in early bootup code. Not even early_printk() - * might work. So we mark any initialization errors and printk - * about it later on, in lockdep_info(). - */ -static int lockdep_init_error; -static const char *lock_init_error; -static unsigned long lockdep_init_trace_data[20]; -static struct stack_trace lockdep_init_trace = { - .max_entries = ARRAY_SIZE(lockdep_init_trace_data), - .entries = lockdep_init_trace_data, -}; - -/* - * Various lockdep statistics: - */ -DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); -#endif - -/* - * Locking printouts: - */ - -#define __USAGE(__STATE) \ - [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \ - [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \ - [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\ - [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R", - -static const char *usage_str[] = -{ -#define LOCKDEP_STATE(__STATE) __USAGE(__STATE) -#include "lockdep_states.h" -#undef LOCKDEP_STATE - [LOCK_USED] = "INITIAL USE", -}; - -const char * __get_key_name(struct lockdep_subclass_key *key, char *str) -{ - return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); -} - -static inline unsigned long lock_flag(enum lock_usage_bit bit) -{ - return 1UL << bit; -} - -static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) -{ - char c = '.'; - - if (class->usage_mask & lock_flag(bit + 2)) - c = '+'; - if (class->usage_mask & lock_flag(bit)) { - c = '-'; - if (class->usage_mask & lock_flag(bit + 2)) - c = '?'; - } - - return c; -} - -void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) -{ - int i = 0; - -#define LOCKDEP_STATE(__STATE) \ - usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \ - usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ); -#include "lockdep_states.h" -#undef LOCKDEP_STATE - - usage[i] = '\0'; -} - -static void __print_lock_name(struct lock_class *class) -{ - char str[KSYM_NAME_LEN]; - const char *name; - - name = class->name; - if (!name) { - name = __get_key_name(class->key, str); - printk("%s", name); - } else { - printk("%s", name); - if (class->name_version > 1) - printk("#%d", class->name_version); - if (class->subclass) - printk("/%d", class->subclass); - } -} - -static void print_lock_name(struct lock_class *class) -{ - char usage[LOCK_USAGE_CHARS]; - - get_usage_chars(class, usage); - - printk(" ("); - __print_lock_name(class); - printk("){%s}", usage); -} - -static void print_lockdep_cache(struct lockdep_map *lock) -{ - const char *name; - char str[KSYM_NAME_LEN]; - - name = lock->name; - if (!name) - name = __get_key_name(lock->key->subkeys, str); - - printk("%s", name); -} - -static void print_lock(struct held_lock *hlock) -{ - print_lock_name(hlock_class(hlock)); - printk(", at: "); - print_ip_sym(hlock->acquire_ip); -} - -static void lockdep_print_held_locks(struct task_struct *curr) -{ - int i, depth = curr->lockdep_depth; - - if (!depth) { - printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); - return; - } - printk("%d lock%s held by %s/%d:\n", - depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr)); - - for (i = 0; i < depth; i++) { - printk(" #%d: ", i); - print_lock(curr->held_locks + i); - } -} - -static void print_kernel_ident(void) -{ - printk("%s %.*s %s\n", init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, - print_tainted()); -} - -static int very_verbose(struct lock_class *class) -{ -#if VERY_VERBOSE - return class_filter(class); -#endif - return 0; -} - -/* - * Is this the address of a static object: - */ -static int static_obj(void *obj) -{ - unsigned long start = (unsigned long) &_stext, - end = (unsigned long) &_end, - addr = (unsigned long) obj; - - /* - * static variable? - */ - if ((addr >= start) && (addr < end)) - return 1; - - if (arch_is_kernel_data(addr)) - return 1; - - /* - * in-kernel percpu var? - */ - if (is_kernel_percpu_address(addr)) - return 1; - - /* - * module static or percpu var? - */ - return is_module_address(addr) || is_module_percpu_address(addr); -} - -/* - * To make lock name printouts unique, we calculate a unique - * class->name_version generation counter: - */ -static int count_matching_names(struct lock_class *new_class) -{ - struct lock_class *class; - int count = 0; - - if (!new_class->name) - return 0; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - if (new_class->key - new_class->subclass == class->key) - return class->name_version; - if (class->name && !strcmp(class->name, new_class->name)) - count = max(count, class->name_version); - } - - return count + 1; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. (we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - lock_init_error = lock->name; - save_stack_trace(&lockdep_init_trace); - } -#endif - - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { - debug_locks_off(); - printk(KERN_ERR - "BUG: looking up invalid subclass: %u\n", subclass); - printk(KERN_ERR - "turning off the locking correctness validator.\n"); - dump_stack(); - return NULL; - } - - /* - * Static locks do not have their class-keys yet - for them the key - * is the lock object itself: - */ - if (unlikely(!lock->key)) - lock->key = (void *)lock; - - /* - * NOTE: the class-key must be unique. For dynamic locks, a static - * lock_class_key variable is passed in through the mutex_init() - * (or spin_lock_init()) call - which acts as the key. For static - * locks we use the lock object itself as the key. - */ - BUILD_BUG_ON(sizeof(struct lock_class_key) > - sizeof(struct lockdep_map)); - - key = lock->key->subkeys + subclass; - - hash_head = classhashentry(key); - - /* - * We can walk the hash lockfree, because the hash only - * grows, and we are careful when adding entries to the end: - */ - list_for_each_entry(class, hash_head, hash_entry) { - if (class->key == key) { - /* - * Huh! same key, different name? Did someone trample - * on some memory? We're most confused. - */ - WARN_ON_ONCE(class->name != lock->name); - return class; - } - } - - return NULL; -} - -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ -static inline struct lock_class * -register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) -{ - struct lockdep_subclass_key *key; - struct list_head *hash_head; - struct lock_class *class; - unsigned long flags; - - class = look_up_lock_class(lock, subclass); - if (likely(class)) - goto out_set_class_cache; - - /* - * Debug-check: all keys must be persistent! - */ - if (!static_obj(lock->key)) { - debug_locks_off(); - printk("INFO: trying to register non-static key.\n"); - printk("the code is fine but needs lockdep annotation.\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - - return NULL; - } - - key = lock->key->subkeys + subclass; - hash_head = classhashentry(key); - - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; - } - /* - * We have to do the hash-walk again, to avoid races - * with another CPU: - */ - list_for_each_entry(class, hash_head, hash_entry) - if (class->key == key) - goto out_unlock_set; - /* - * Allocate a new key from the static array, and add it to - * the hash: - */ - if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { - if (!debug_locks_off_graph_unlock()) { - raw_local_irq_restore(flags); - return NULL; - } - raw_local_irq_restore(flags); - - print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); - dump_stack(); - return NULL; - } - class = lock_classes + nr_lock_classes++; - debug_atomic_inc(nr_unused_locks); - class->key = key; - class->name = lock->name; - class->subclass = subclass; - INIT_LIST_HEAD(&class->lock_entry); - INIT_LIST_HEAD(&class->locks_before); - INIT_LIST_HEAD(&class->locks_after); - class->name_version = count_matching_names(class); - /* - * We use RCU's safe list-add method to make - * parallel walking of the hash-list safe: - */ - list_add_tail_rcu(&class->hash_entry, hash_head); - /* - * Add it to the global list of classes: - */ - list_add_tail_rcu(&class->lock_entry, &all_lock_classes); - - if (verbose(class)) { - graph_unlock(); - raw_local_irq_restore(flags); - - printk("\nnew class %p: %s", class->key, class->name); - if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); - dump_stack(); - - raw_local_irq_save(flags); - if (!graph_lock()) { - raw_local_irq_restore(flags); - return NULL; - } - } -out_unlock_set: - graph_unlock(); - raw_local_irq_restore(flags); - -out_set_class_cache: - if (!subclass || force) - lock->class_cache[0] = class; - else if (subclass < NR_LOCKDEP_CACHING_CLASSES) - lock->class_cache[subclass] = class; - - /* - * Hash collision, did we smoke some? We found a class with a matching - * hash but the subclass -- which is hashed in -- didn't match. - */ - if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) - return NULL; - - return class; -} - -#ifdef CONFIG_PROVE_LOCKING -/* - * Allocate a lockdep entry. (assumes the graph_lock held, returns - * with NULL on failure) - */ -static struct lock_list *alloc_list_entry(void) -{ - if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { - if (!debug_locks_off_graph_unlock()) - return NULL; - - print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); - dump_stack(); - return NULL; - } - return list_entries + nr_list_entries++; -} - -/* - * Add a new dependency to the head of the list: - */ -static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, - int distance, struct stack_trace *trace) -{ - struct lock_list *entry; - /* - * Lock not present yet - get a new dependency struct and - * add it to the list: - */ - entry = alloc_list_entry(); - if (!entry) - return 0; - - entry->class = this; - entry->distance = distance; - entry->trace = *trace; - /* - * Since we never remove from the dependency list, the list can - * be walked lockless by other CPUs, it's only allocation - * that must be protected by the spinlock. But this also means - * we must make new entries visible only once writes to the - * entry become visible - hence the RCU op: - */ - list_add_tail_rcu(&entry->entry, head); - - return 1; -} - -/* - * For good efficiency of modular, we use power of 2 - */ -#define MAX_CIRCULAR_QUEUE_SIZE 4096UL -#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) - -/* - * The circular_queue and helpers is used to implement the - * breadth-first search(BFS)algorithem, by which we can build - * the shortest path from the next lock to be acquired to the - * previous held lock if there is a circular between them. - */ -struct circular_queue { - unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; - unsigned int front, rear; -}; - -static struct circular_queue lock_cq; - -unsigned int max_bfs_queue_depth; - -static unsigned int lockdep_dependency_gen_id; - -static inline void __cq_init(struct circular_queue *cq) -{ - cq->front = cq->rear = 0; - lockdep_dependency_gen_id++; -} - -static inline int __cq_empty(struct circular_queue *cq) -{ - return (cq->front == cq->rear); -} - -static inline int __cq_full(struct circular_queue *cq) -{ - return ((cq->rear + 1) & CQ_MASK) == cq->front; -} - -static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) -{ - if (__cq_full(cq)) - return -1; - - cq->element[cq->rear] = elem; - cq->rear = (cq->rear + 1) & CQ_MASK; - return 0; -} - -static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) -{ - if (__cq_empty(cq)) - return -1; - - *elem = cq->element[cq->front]; - cq->front = (cq->front + 1) & CQ_MASK; - return 0; -} - -static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) -{ - return (cq->rear - cq->front) & CQ_MASK; -} - -static inline void mark_lock_accessed(struct lock_list *lock, - struct lock_list *parent) -{ - unsigned long nr; - - nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ - lock->parent = parent; - lock->class->dep_gen_id = lockdep_dependency_gen_id; -} - -static inline unsigned long lock_accessed(struct lock_list *lock) -{ - unsigned long nr; - - nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ - return lock->class->dep_gen_id == lockdep_dependency_gen_id; -} - -static inline struct lock_list *get_lock_parent(struct lock_list *child) -{ - return child->parent; -} - -static inline int get_lock_depth(struct lock_list *child) -{ - int depth = 0; - struct lock_list *parent; - - while ((parent = get_lock_parent(child))) { - child = parent; - depth++; - } - return depth; -} - -static int __bfs(struct lock_list *source_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry, - int forward) -{ - struct lock_list *entry; - struct list_head *head; - struct circular_queue *cq = &lock_cq; - int ret = 1; - - if (match(source_entry, data)) { - *target_entry = source_entry; - ret = 0; - goto exit; - } - - if (forward) - head = &source_entry->class->locks_after; - else - head = &source_entry->class->locks_before; - - if (list_empty(head)) - goto exit; - - __cq_init(cq); - __cq_enqueue(cq, (unsigned long)source_entry); - - while (!__cq_empty(cq)) { - struct lock_list *lock; - - __cq_dequeue(cq, (unsigned long *)&lock); - - if (!lock->class) { - ret = -2; - goto exit; - } - - if (forward) - head = &lock->class->locks_after; - else - head = &lock->class->locks_before; - - list_for_each_entry(entry, head, entry) { - if (!lock_accessed(entry)) { - unsigned int cq_depth; - mark_lock_accessed(entry, lock); - if (match(entry, data)) { - *target_entry = entry; - ret = 0; - goto exit; - } - - if (__cq_enqueue(cq, (unsigned long)entry)) { - ret = -1; - goto exit; - } - cq_depth = __cq_get_elem_count(cq); - if (max_bfs_queue_depth < cq_depth) - max_bfs_queue_depth = cq_depth; - } - } - } -exit: - return ret; -} - -static inline int __bfs_forwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) -{ - return __bfs(src_entry, data, match, target_entry, 1); - -} - -static inline int __bfs_backwards(struct lock_list *src_entry, - void *data, - int (*match)(struct lock_list *entry, void *data), - struct lock_list **target_entry) -{ - return __bfs(src_entry, data, match, target_entry, 0); - -} - -/* - * Recursive, forwards-direction lock-dependency checking, used for - * both noncyclic checking and for hardirq-unsafe/softirq-unsafe - * checking. - */ - -/* - * Print a dependency chain entry (this is only done when a deadlock - * has been detected): - */ -static noinline int -print_circular_bug_entry(struct lock_list *target, int depth) -{ - if (debug_locks_silent) - return 0; - printk("\n-> #%u", depth); - print_lock_name(target->class); - printk(":\n"); - print_stack_trace(&target->trace, 6); - - return 0; -} - -static void -print_circular_lock_scenario(struct held_lock *src, - struct held_lock *tgt, - struct lock_list *prt) -{ - struct lock_class *source = hlock_class(src); - struct lock_class *target = hlock_class(tgt); - struct lock_class *parent = prt->class; - - /* - * A direct locking problem where unsafe_class lock is taken - * directly by safe_class lock, then all we need to show - * is the deadlock scenario, as it is obvious that the - * unsafe lock is taken under the safe lock. - * - * But if there is a chain instead, where the safe lock takes - * an intermediate lock (middle_class) where this lock is - * not the same as the safe lock, then the lock chain is - * used to describe the problem. Otherwise we would need - * to show a different CPU case for each link in the chain - * from the safe_class lock to the unsafe_class lock. - */ - if (parent != source) { - printk("Chain exists of:\n "); - __print_lock_name(source); - printk(" --> "); - __print_lock_name(parent); - printk(" --> "); - __print_lock_name(target); - printk("\n\n"); - } - - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(");\n"); - printk(" lock("); - __print_lock_name(target); - printk(");\n"); - printk(" lock("); - __print_lock_name(source); - printk(");\n"); - printk("\n *** DEADLOCK ***\n\n"); -} - -/* - * When a circular dependency is detected, print the - * header first: - */ -static noinline int -print_circular_bug_header(struct lock_list *entry, unsigned int depth, - struct held_lock *check_src, - struct held_lock *check_tgt) -{ - struct task_struct *curr = current; - - if (debug_locks_silent) - return 0; - - printk("\n"); - printk("======================================================\n"); - printk("[ INFO: possible circular locking dependency detected ]\n"); - print_kernel_ident(); - printk("-------------------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", - curr->comm, task_pid_nr(curr)); - print_lock(check_src); - printk("\nbut task is already holding lock:\n"); - print_lock(check_tgt); - printk("\nwhich lock already depends on the new lock.\n\n"); - printk("\nthe existing dependency chain (in reverse order) is:\n"); - - print_circular_bug_entry(entry, depth); - - return 0; -} - -static inline int class_equal(struct lock_list *entry, void *data) -{ - return entry->class == data; -} - -static noinline int print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt) -{ - struct task_struct *curr = current; - struct lock_list *parent; - struct lock_list *first_parent; - int depth; - - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - if (!save_trace(&this->trace)) - return 0; - - depth = get_lock_depth(target); - - print_circular_bug_header(target, depth, check_src, check_tgt); - - parent = get_lock_parent(target); - first_parent = parent; - - while (parent) { - print_circular_bug_entry(parent, --depth); - parent = get_lock_parent(parent); - } - - printk("\nother info that might help us debug this:\n\n"); - print_circular_lock_scenario(check_src, check_tgt, - first_parent); - - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -static noinline int print_bfs_bug(int ret) -{ - if (!debug_locks_off_graph_unlock()) - return 0; - - /* - * Breadth-first-search failed, graph got corrupted? - */ - WARN(1, "lockdep bfs error:%d\n", ret); - - return 0; -} - -static int noop_count(struct lock_list *entry, void *data) -{ - (*(unsigned long *)data)++; - return 0; -} - -unsigned long __lockdep_count_forward_deps(struct lock_list *this) -{ - unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); - - __bfs_forwards(this, (void *)&count, noop_count, &target_entry); - - return count; -} -unsigned long lockdep_count_forward_deps(struct lock_class *class) -{ - unsigned long ret, flags; - struct lock_list this; - - this.parent = NULL; - this.class = class; - - local_irq_save(flags); - arch_spin_lock(&lockdep_lock); - ret = __lockdep_count_forward_deps(&this); - arch_spin_unlock(&lockdep_lock); - local_irq_restore(flags); - - return ret; -} - -unsigned long __lockdep_count_backward_deps(struct lock_list *this) -{ - unsigned long count = 0; - struct lock_list *uninitialized_var(target_entry); - - __bfs_backwards(this, (void *)&count, noop_count, &target_entry); - - return count; -} - -unsigned long lockdep_count_backward_deps(struct lock_class *class) -{ - unsigned long ret, flags; - struct lock_list this; - - this.parent = NULL; - this.class = class; - - local_irq_save(flags); - arch_spin_lock(&lockdep_lock); - ret = __lockdep_count_backward_deps(&this); - arch_spin_unlock(&lockdep_lock); - local_irq_restore(flags); - - return ret; -} - -/* - * Prove that the dependency graph starting at can not - * lead to . Print an error and return 0 if it does. - */ -static noinline int -check_noncircular(struct lock_list *root, struct lock_class *target, - struct lock_list **target_entry) -{ - int result; - - debug_atomic_inc(nr_cyclic_checks); - - result = __bfs_forwards(root, target, class_equal, target_entry); - - return result; -} - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) -/* - * Forwards and backwards subgraph searching, for the purposes of - * proving that two subgraphs can be connected by a new dependency - * without creating any illegal irq-safe -> irq-unsafe lock dependency. - */ - -static inline int usage_match(struct lock_list *entry, void *bit) -{ - return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); -} - - - -/* - * Find a node in the forwards-direction dependency sub-graph starting - * at @root->class that matches @bit. - * - * Return 0 if such a node exists in the subgraph, and put that node - * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. - */ -static int -find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, - struct lock_list **target_entry) -{ - int result; - - debug_atomic_inc(nr_find_usage_forwards_checks); - - result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); - - return result; -} - -/* - * Find a node in the backwards-direction dependency sub-graph starting - * at @root->class that matches @bit. - * - * Return 0 if such a node exists in the subgraph, and put that node - * into *@target_entry. - * - * Return 1 otherwise and keep *@target_entry unchanged. - * Return <0 on error. - */ -static int -find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, - struct lock_list **target_entry) -{ - int result; - - debug_atomic_inc(nr_find_usage_backwards_checks); - - result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); - - return result; -} - -static void print_lock_class_header(struct lock_class *class, int depth) -{ - int bit; - - printk("%*s->", depth, ""); - print_lock_name(class); - printk(" ops: %lu", class->ops); - printk(" {\n"); - - for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { - if (class->usage_mask & (1 << bit)) { - int len = depth; - - len += printk("%*s %s", depth, "", usage_str[bit]); - len += printk(" at:\n"); - print_stack_trace(class->usage_traces + bit, len); - } - } - printk("%*s }\n", depth, ""); - - printk("%*s ... key at: ",depth,""); - print_ip_sym((unsigned long)class->key); -} - -/* - * printk the shortest lock dependencies from @start to @end in reverse order: - */ -static void __used -print_shortest_lock_dependencies(struct lock_list *leaf, - struct lock_list *root) -{ - struct lock_list *entry = leaf; - int depth; - - /*compute depth from generated tree by BFS*/ - depth = get_lock_depth(leaf); - - do { - print_lock_class_header(entry->class, depth); - printk("%*s ... acquired at:\n", depth, ""); - print_stack_trace(&entry->trace, 2); - printk("\n"); - - if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad path found in chain graph\n", __func__); - break; - } - - entry = get_lock_parent(entry); - depth--; - } while (entry && (depth >= 0)); - - return; -} - -static void -print_irq_lock_scenario(struct lock_list *safe_entry, - struct lock_list *unsafe_entry, - struct lock_class *prev_class, - struct lock_class *next_class) -{ - struct lock_class *safe_class = safe_entry->class; - struct lock_class *unsafe_class = unsafe_entry->class; - struct lock_class *middle_class = prev_class; - - if (middle_class == safe_class) - middle_class = next_class; - - /* - * A direct locking problem where unsafe_class lock is taken - * directly by safe_class lock, then all we need to show - * is the deadlock scenario, as it is obvious that the - * unsafe lock is taken under the safe lock. - * - * But if there is a chain instead, where the safe lock takes - * an intermediate lock (middle_class) where this lock is - * not the same as the safe lock, then the lock chain is - * used to describe the problem. Otherwise we would need - * to show a different CPU case for each link in the chain - * from the safe_class lock to the unsafe_class lock. - */ - if (middle_class != unsafe_class) { - printk("Chain exists of:\n "); - __print_lock_name(safe_class); - printk(" --> "); - __print_lock_name(middle_class); - printk(" --> "); - __print_lock_name(unsafe_class); - printk("\n\n"); - } - - printk(" Possible interrupt unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(unsafe_class); - printk(");\n"); - printk(" local_irq_disable();\n"); - printk(" lock("); - __print_lock_name(safe_class); - printk(");\n"); - printk(" lock("); - __print_lock_name(middle_class); - printk(");\n"); - printk(" \n"); - printk(" lock("); - __print_lock_name(safe_class); - printk(");\n"); - printk("\n *** DEADLOCK ***\n\n"); -} - -static int -print_bad_irq_dependency(struct task_struct *curr, - struct lock_list *prev_root, - struct lock_list *next_root, - struct lock_list *backwards_entry, - struct lock_list *forwards_entry, - struct held_lock *prev, - struct held_lock *next, - enum lock_usage_bit bit1, - enum lock_usage_bit bit2, - const char *irqclass) -{ - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n"); - printk("======================================================\n"); - printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", - irqclass, irqclass); - print_kernel_ident(); - printk("------------------------------------------------------\n"); - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", - curr->comm, task_pid_nr(curr), - curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, - curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, - curr->hardirqs_enabled, - curr->softirqs_enabled); - print_lock(next); - - printk("\nand this task is already holding:\n"); - print_lock(prev); - printk("which would create a new lock dependency:\n"); - print_lock_name(hlock_class(prev)); - printk(" ->"); - print_lock_name(hlock_class(next)); - printk("\n"); - - printk("\nbut this new dependency connects a %s-irq-safe lock:\n", - irqclass); - print_lock_name(backwards_entry->class); - printk("\n... which became %s-irq-safe at:\n", irqclass); - - print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); - - printk("\nto a %s-irq-unsafe lock:\n", irqclass); - print_lock_name(forwards_entry->class); - printk("\n... which became %s-irq-unsafe at:\n", irqclass); - printk("..."); - - print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); - - printk("\nother info that might help us debug this:\n\n"); - print_irq_lock_scenario(backwards_entry, forwards_entry, - hlock_class(prev), hlock_class(next)); - - lockdep_print_held_locks(curr); - - printk("\nthe dependencies between %s-irq-safe lock", irqclass); - printk(" and the holding lock:\n"); - if (!save_trace(&prev_root->trace)) - return 0; - print_shortest_lock_dependencies(backwards_entry, prev_root); - - printk("\nthe dependencies between the lock to be acquired"); - printk(" and %s-irq-unsafe lock:\n", irqclass); - if (!save_trace(&next_root->trace)) - return 0; - print_shortest_lock_dependencies(forwards_entry, next_root); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -static int -check_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit_backwards, - enum lock_usage_bit bit_forwards, const char *irqclass) -{ - int ret; - struct lock_list this, that; - struct lock_list *uninitialized_var(target_entry); - struct lock_list *uninitialized_var(target_entry1); - - this.parent = NULL; - - this.class = hlock_class(prev); - ret = find_usage_backwards(&this, bit_backwards, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - that.parent = NULL; - that.class = hlock_class(next); - ret = find_usage_forwards(&that, bit_forwards, &target_entry1); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_bad_irq_dependency(curr, &this, &that, - target_entry, target_entry1, - prev, next, - bit_backwards, bit_forwards, irqclass); -} - -static const char *state_names[] = { -#define LOCKDEP_STATE(__STATE) \ - __stringify(__STATE), -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - -static const char *state_rnames[] = { -#define LOCKDEP_STATE(__STATE) \ - __stringify(__STATE)"-READ", -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - -static inline const char *state_name(enum lock_usage_bit bit) -{ - return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; -} - -static int exclusive_bit(int new_bit) -{ - /* - * USED_IN - * USED_IN_READ - * ENABLED - * ENABLED_READ - * - * bit 0 - write/read - * bit 1 - used_in/enabled - * bit 2+ state - */ - - int state = new_bit & ~3; - int dir = new_bit & 2; - - /* - * keep state, bit flip the direction and strip read. - */ - return state | (dir ^ 2); -} - -static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit) -{ - /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; - - bit++; /* _READ */ - - /* - * Prove that the new dependency does not connect a hardirq-safe-read - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at , and the - * forwards-subgraph starting at : - */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) - return 0; - - return 1; -} - -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ -#define LOCKDEP_STATE(__STATE) \ - if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ - return 0; -#include "lockdep_states.h" -#undef LOCKDEP_STATE - - return 1; -} - -static void inc_chains(void) -{ - if (current->hardirq_context) - nr_hardirq_chains++; - else { - if (current->softirq_context) - nr_softirq_chains++; - else - nr_process_chains++; - } -} - -#else - -static inline int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ - return 1; -} - -static inline void inc_chains(void) -{ - nr_process_chains++; -} - -#endif - -static void -print_deadlock_scenario(struct held_lock *nxt, - struct held_lock *prv) -{ - struct lock_class *next = hlock_class(nxt); - struct lock_class *prev = hlock_class(prv); - - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0\n"); - printk(" ----\n"); - printk(" lock("); - __print_lock_name(prev); - printk(");\n"); - printk(" lock("); - __print_lock_name(next); - printk(");\n"); - printk("\n *** DEADLOCK ***\n\n"); - printk(" May be due to missing lock nesting notation\n\n"); -} - -static int -print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n"); - printk("=============================================\n"); - printk("[ INFO: possible recursive locking detected ]\n"); - print_kernel_ident(); - printk("---------------------------------------------\n"); - printk("%s/%d is trying to acquire lock:\n", - curr->comm, task_pid_nr(curr)); - print_lock(next); - printk("\nbut task is already holding lock:\n"); - print_lock(prev); - - printk("\nother info that might help us debug this:\n"); - print_deadlock_scenario(next, prev); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Check whether we are holding such a class already. - * - * (Note that this has to be done separately, because the graph cannot - * detect such classes of deadlocks.) - * - * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read - */ -static int -check_deadlock(struct task_struct *curr, struct held_lock *next, - struct lockdep_map *next_instance, int read) -{ - struct held_lock *prev; - struct held_lock *nest = NULL; - int i; - - for (i = 0; i < curr->lockdep_depth; i++) { - prev = curr->held_locks + i; - - if (prev->instance == next->nest_lock) - nest = prev; - - if (hlock_class(prev) != hlock_class(next)) - continue; - - /* - * Allow read-after-read recursion of the same - * lock class (i.e. read_lock(lock)+read_lock(lock)): - */ - if ((read == 2) && prev->read) - return 2; - - /* - * We're holding the nest_lock, which serializes this lock's - * nesting behaviour. - */ - if (nest) - return 2; - - return print_deadlock_bug(curr, prev, next); - } - return 1; -} - -/* - * There was a chain-cache miss, and we are about to add a new dependency - * to a previous lock. We recursively validate the following rules: - * - * - would the adding of the -> dependency create a - * circular dependency in the graph? [== circular deadlock] - * - * - does the new prev->next dependency connect any hardirq-safe lock - * (in the full backwards-subgraph starting at ) with any - * hardirq-unsafe lock (in the full forwards-subgraph starting at - * )? [== illegal lock inversion with hardirq contexts] - * - * - does the new prev->next dependency connect any softirq-safe lock - * (in the full backwards-subgraph starting at ) with any - * softirq-unsafe lock (in the full forwards-subgraph starting at - * )? [== illegal lock inversion with softirq contexts] - * - * any of these scenarios could lead to a deadlock. - * - * Then if all the validations pass, we add the forwards and backwards - * dependency. - */ -static int -check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int trylock_loop) -{ - struct lock_list *entry; - int ret; - struct lock_list this; - struct lock_list *uninitialized_var(target_entry); - /* - * Static variable, serialized by the graph_lock(). - * - * We use this static variable to save the stack trace in case - * we call into this function multiple times due to encountering - * trylocks in the held lock stack. - */ - static struct stack_trace trace; - - /* - * Prove that the new -> dependency would not - * create a circular dependency in the graph. (We do this by - * forward-recursing into the graph starting at , and - * checking whether we can reach .) - * - * We are using global variables to control the recursion, to - * keep the stackframe size of the recursive functions low: - */ - this.class = hlock_class(next); - this.parent = NULL; - ret = check_noncircular(&this, hlock_class(prev), &target_entry); - if (unlikely(!ret)) - return print_circular_bug(&this, target_entry, next, prev); - else if (unlikely(ret < 0)) - return print_bfs_bug(ret); - - if (!check_prev_add_irq(curr, prev, next)) - return 0; - - /* - * For recursive read-locks we do all the dependency checks, - * but we dont store read-triggered dependencies (only - * write-triggered dependencies). This ensures that only the - * write-side dependencies matter, and that if for example a - * write-lock never takes any other locks, then the reads are - * equivalent to a NOP. - */ - if (next->read == 2 || prev->read == 2) - return 1; - /* - * Is the -> dependency already present? - * - * (this may occur even though this is a new chain: consider - * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3 - * chains - the second one will be new, but L1 already has - * L2 added to its dependency list, due to the first chain.) - */ - list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) { - if (entry->class == hlock_class(next)) { - if (distance == 1) - entry->distance = 1; - return 2; - } - } - - if (!trylock_loop && !save_trace(&trace)) - return 0; - - /* - * Ok, all validations passed, add the new lock - * to the previous lock's dependency list: - */ - ret = add_lock_to_list(hlock_class(prev), hlock_class(next), - &hlock_class(prev)->locks_after, - next->acquire_ip, distance, &trace); - - if (!ret) - return 0; - - ret = add_lock_to_list(hlock_class(next), hlock_class(prev), - &hlock_class(next)->locks_before, - next->acquire_ip, distance, &trace); - if (!ret) - return 0; - - /* - * Debugging printouts: - */ - if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { - graph_unlock(); - printk("\n new dependency: "); - print_lock_name(hlock_class(prev)); - printk(" => "); - print_lock_name(hlock_class(next)); - printk("\n"); - dump_stack(); - return graph_lock(); - } - return 1; -} - -/* - * Add the dependency to all directly-previous locks that are 'relevant'. - * The ones that are relevant are (in increasing distance from curr): - * all consecutive trylock entries and the final non-trylock entry - or - * the end of this context's lock-chain - whichever comes first. - */ -static int -check_prevs_add(struct task_struct *curr, struct held_lock *next) -{ - int depth = curr->lockdep_depth; - int trylock_loop = 0; - struct held_lock *hlock; - - /* - * Debugging checks. - * - * Depth must not be zero for a non-head lock: - */ - if (!depth) - goto out_bug; - /* - * At least two relevant locks must exist for this - * to be a head: - */ - if (curr->held_locks[depth].irq_context != - curr->held_locks[depth-1].irq_context) - goto out_bug; - - for (;;) { - int distance = curr->lockdep_depth - depth + 1; - hlock = curr->held_locks + depth-1; - /* - * Only non-recursive-read entries get new dependencies - * added: - */ - if (hlock->read != 2) { - if (!check_prev_add(curr, hlock, next, - distance, trylock_loop)) - return 0; - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } - depth--; - /* - * End of lock-stack? - */ - if (!depth) - break; - /* - * Stop the search if we cross into another context: - */ - if (curr->held_locks[depth].irq_context != - curr->held_locks[depth-1].irq_context) - break; - trylock_loop = 1; - } - return 1; -out_bug: - if (!debug_locks_off_graph_unlock()) - return 0; - - /* - * Clearly we all shouldn't be here, but since we made it we - * can reliable say we messed up our state. See the above two - * gotos for reasons why we could possibly end up here. - */ - WARN_ON(1); - - return 0; -} - -unsigned long nr_lock_chains; -struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; -int nr_chain_hlocks; -static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; - -struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) -{ - return lock_classes + chain_hlocks[chain->base + i]; -} - -/* - * Look up a dependency chain. If the key is not present yet then - * add it and return 1 - in this case the new dependency chain is - * validated. If the key is already hashed, return 0. - * (On return with 1 graph_lock is held.) - */ -static inline int lookup_chain_cache(struct task_struct *curr, - struct held_lock *hlock, - u64 chain_key) -{ - struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); - struct lock_chain *chain; - struct held_lock *hlock_curr; - int i, j; - - /* - * We might need to take the graph lock, ensure we've got IRQs - * disabled to make this an IRQ-safe lock.. for recursion reasons - * lockdep won't complain about its own locking errors. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - /* - * We can walk it lock-free, because entries only get added - * to the hash: - */ - list_for_each_entry(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { -cache_hit: - debug_atomic_inc(chain_lookup_hits); - if (very_verbose(class)) - printk("\nhash chain already cached, key: " - "%016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, - class->key, class->name); - return 0; - } - } - if (very_verbose(class)) - printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", - (unsigned long long)chain_key, class->key, class->name); - /* - * Allocate a new chain entry from the static array, and add - * it to the hash: - */ - if (!graph_lock()) - return 0; - /* - * We have to walk the chain again locked - to avoid duplicates: - */ - list_for_each_entry(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { - graph_unlock(); - goto cache_hit; - } - } - if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { - if (!debug_locks_off_graph_unlock()) - return 0; - - print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); - dump_stack(); - return 0; - } - chain = lock_chains + nr_lock_chains++; - chain->chain_key = chain_key; - chain->irq_context = hlock->irq_context; - /* Find the first held_lock of current chain */ - for (i = curr->lockdep_depth - 1; i >= 0; i--) { - hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock->irq_context) - break; - } - i++; - chain->depth = curr->lockdep_depth + 1 - i; - if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { - chain->base = nr_chain_hlocks; - nr_chain_hlocks += chain->depth; - for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx - 1; - chain_hlocks[chain->base + j] = lock_id; - } - chain_hlocks[chain->base + j] = class - lock_classes; - } - list_add_tail_rcu(&chain->entry, hash_head); - debug_atomic_inc(chain_lookup_misses); - inc_chains(); - - return 1; -} - -static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, - struct held_lock *hlock, int chain_head, u64 chain_key) -{ - /* - * Trylock needs to maintain the stack of held locks, but it - * does not add new dependencies, because trylock can be done - * in any order. - * - * We look up the chain_key and do the O(N^2) check and update of - * the dependencies only if this is a new dependency chain. - * (If lookup_chain_cache() returns with 1 it acquires - * graph_lock for us) - */ - if (!hlock->trylock && (hlock->check == 2) && - lookup_chain_cache(curr, hlock, chain_key)) { - /* - * Check whether last held lock: - * - * - is irq-safe, if this lock is irq-unsafe - * - is softirq-safe, if this lock is hardirq-unsafe - * - * And check whether the new lock's dependency graph - * could lead back to the previous lock. - * - * any of these scenarios could lead to a deadlock. If - * All validations - */ - int ret = check_deadlock(curr, hlock, lock, hlock->read); - - if (!ret) - return 0; - /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; - /* - * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: - */ - if (!chain_head && ret != 2) - if (!check_prevs_add(curr, hlock)) - return 0; - graph_unlock(); - } else - /* after lookup_chain_cache(): */ - if (unlikely(!debug_locks)) - return 0; - - return 1; -} -#else -static inline int validate_chain(struct task_struct *curr, - struct lockdep_map *lock, struct held_lock *hlock, - int chain_head, u64 chain_key) -{ - return 1; -} -#endif - -/* - * We are building curr_chain_key incrementally, so double-check - * it from scratch, to make sure that it's done correctly: - */ -static void check_chain_key(struct task_struct *curr) -{ -#ifdef CONFIG_DEBUG_LOCKDEP - struct held_lock *hlock, *prev_hlock = NULL; - unsigned int i, id; - u64 chain_key = 0; - - for (i = 0; i < curr->lockdep_depth; i++) { - hlock = curr->held_locks + i; - if (chain_key != hlock->prev_chain_key) { - debug_locks_off(); - /* - * We got mighty confused, our chain keys don't match - * with what we expect, someone trample on our task state? - */ - WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", - curr->lockdep_depth, i, - (unsigned long long)chain_key, - (unsigned long long)hlock->prev_chain_key); - return; - } - id = hlock->class_idx - 1; - /* - * Whoops ran out of static storage again? - */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) - return; - - if (prev_hlock && (prev_hlock->irq_context != - hlock->irq_context)) - chain_key = 0; - chain_key = iterate_chain_key(chain_key, id); - prev_hlock = hlock; - } - if (chain_key != curr->curr_chain_key) { - debug_locks_off(); - /* - * More smoking hash instead of calculating it, damn see these - * numbers float.. I bet that a pink elephant stepped on my memory. - */ - WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", - curr->lockdep_depth, i, - (unsigned long long)chain_key, - (unsigned long long)curr->curr_chain_key); - } -#endif -} - -static void -print_usage_bug_scenario(struct held_lock *lock) -{ - struct lock_class *class = hlock_class(lock); - - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0\n"); - printk(" ----\n"); - printk(" lock("); - __print_lock_name(class); - printk(");\n"); - printk(" \n"); - printk(" lock("); - __print_lock_name(class); - printk(");\n"); - printk("\n *** DEADLOCK ***\n\n"); -} - -static int -print_usage_bug(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) -{ - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n"); - printk("=================================\n"); - printk("[ INFO: inconsistent lock state ]\n"); - print_kernel_ident(); - printk("---------------------------------\n"); - - printk("inconsistent {%s} -> {%s} usage.\n", - usage_str[prev_bit], usage_str[new_bit]); - - printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", - curr->comm, task_pid_nr(curr), - trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, - trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, - trace_hardirqs_enabled(curr), - trace_softirqs_enabled(curr)); - print_lock(this); - - printk("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); - - print_irqtrace_events(curr); - printk("\nother info that might help us debug this:\n"); - print_usage_bug_scenario(this); - - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Print out an error if an invalid bit is set: - */ -static inline int -valid_state(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) -{ - if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) - return print_usage_bug(curr, this, bad_bit, new_bit); - return 1; -} - -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit); - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) - -/* - * print irq inversion bug: - */ -static int -print_irq_inversion_bug(struct task_struct *curr, - struct lock_list *root, struct lock_list *other, - struct held_lock *this, int forwards, - const char *irqclass) -{ - struct lock_list *entry = other; - struct lock_list *middle = NULL; - int depth; - - if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; - - printk("\n"); - printk("=========================================================\n"); - printk("[ INFO: possible irq lock inversion dependency detected ]\n"); - print_kernel_ident(); - printk("---------------------------------------------------------\n"); - printk("%s/%d just changed the state of lock:\n", - curr->comm, task_pid_nr(curr)); - print_lock(this); - if (forwards) - printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); - else - printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); - print_lock_name(other->class); - printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); - - printk("\nother info that might help us debug this:\n"); - - /* Find a middle lock (if one exists) */ - depth = get_lock_depth(other); - do { - if (depth == 0 && (entry != root)) { - printk("lockdep:%s bad path found in chain graph\n", __func__); - break; - } - middle = entry; - entry = get_lock_parent(entry); - depth--; - } while (entry && entry != root && (depth >= 0)); - if (forwards) - print_irq_lock_scenario(root, other, - middle ? middle->class : root->class, other->class); - else - print_irq_lock_scenario(other, root, - middle ? middle->class : other->class, root->class); - - lockdep_print_held_locks(curr); - - printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); - if (!save_trace(&root->trace)) - return 0; - print_shortest_lock_dependencies(other, root); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Prove that in the forwards-direction subgraph starting at - * there is no lock matching : - */ -static int -check_usage_forwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) -{ - int ret; - struct lock_list root; - struct lock_list *uninitialized_var(target_entry); - - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_forwards(&root, bit, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); -} - -/* - * Prove that in the backwards-direction subgraph starting at - * there is no lock matching : - */ -static int -check_usage_backwards(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit bit, const char *irqclass) -{ - int ret; - struct lock_list root; - struct lock_list *uninitialized_var(target_entry); - - root.parent = NULL; - root.class = hlock_class(this); - ret = find_usage_backwards(&root, bit, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); -} - -void print_irqtrace_events(struct task_struct *curr) -{ - printk("irq event stamp: %u\n", curr->irq_events); - printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); - print_ip_sym(curr->hardirq_enable_ip); - printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); - print_ip_sym(curr->hardirq_disable_ip); - printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); - print_ip_sym(curr->softirq_enable_ip); - printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); - print_ip_sym(curr->softirq_disable_ip); -} - -static int HARDIRQ_verbose(struct lock_class *class) -{ -#if HARDIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -static int SOFTIRQ_verbose(struct lock_class *class) -{ -#if SOFTIRQ_VERBOSE - return class_filter(class); -#endif - return 0; -} - -static int RECLAIM_FS_verbose(struct lock_class *class) -{ -#if RECLAIM_VERBOSE - return class_filter(class); -#endif - return 0; -} - -#define STRICT_READ_CHECKS 1 - -static int (*state_verbose_f[])(struct lock_class *class) = { -#define LOCKDEP_STATE(__STATE) \ - __STATE##_verbose, -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - -static inline int state_verbose(enum lock_usage_bit bit, - struct lock_class *class) -{ - return state_verbose_f[bit >> 2](class); -} - -typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, - enum lock_usage_bit bit, const char *name); - -static int -mark_lock_irq(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - int excl_bit = exclusive_bit(new_bit); - int read = new_bit & 1; - int dir = new_bit & 2; - - /* - * mark USED_IN has to look forwards -- to ensure no dependency - * has ENABLED state, which would allow recursion deadlocks. - * - * mark ENABLED has to look backwards -- to ensure no dependee - * has USED_IN state, which, again, would allow recursion deadlocks. - */ - check_usage_f usage = dir ? - check_usage_backwards : check_usage_forwards; - - /* - * Validate that this particular lock does not have conflicting - * usage states. - */ - if (!valid_state(curr, this, new_bit, excl_bit)) - return 0; - - /* - * Validate that the lock dependencies don't have conflicting usage - * states. - */ - if ((!read || !dir || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~1))) - return 0; - - /* - * Check for read in write conflicts - */ - if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + 1)) - return 0; - - if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + 1, - state_name(new_bit + 1))) - return 0; - } - - if (state_verbose(new_bit, hlock_class(this))) - return 2; - - return 1; -} - -enum mark_type { -#define LOCKDEP_STATE(__STATE) __STATE, -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - -/* - * Mark all held locks with a usage bit: - */ -static int -mark_held_locks(struct task_struct *curr, enum mark_type mark) -{ - enum lock_usage_bit usage_bit; - struct held_lock *hlock; - int i; - - for (i = 0; i < curr->lockdep_depth; i++) { - hlock = curr->held_locks + i; - - usage_bit = 2 + (mark << 2); /* ENABLED */ - if (hlock->read) - usage_bit += 1; /* READ */ - - BUG_ON(usage_bit >= LOCK_USAGE_STATES); - - if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) - continue; - - if (!mark_lock(curr, hlock, usage_bit)) - return 0; - } - - return 1; -} - -/* - * Hardirqs will be enabled: - */ -static void __trace_hardirqs_on_caller(unsigned long ip) -{ - struct task_struct *curr = current; - - /* we'll do an OFF -> ON transition: */ - curr->hardirqs_enabled = 1; - - /* - * We are going to turn hardirqs on, so set the - * usage bit for all held locks: - */ - if (!mark_held_locks(curr, HARDIRQ)) - return; - /* - * If we have softirqs enabled, then set the usage - * bit for all held locks. (disabled hardirqs prevented - * this bit from being set before) - */ - if (curr->softirqs_enabled) - if (!mark_held_locks(curr, SOFTIRQ)) - return; - - curr->hardirq_enable_ip = ip; - curr->hardirq_enable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_on_events); -} - -void trace_hardirqs_on_caller(unsigned long ip) -{ - time_hardirqs_on(CALLER_ADDR0, ip); - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (unlikely(current->hardirqs_enabled)) { - /* - * Neither irq nor preemption are disabled here - * so this is racy by nature but losing one hit - * in a stat is not a big deal. - */ - __debug_atomic_inc(redundant_hardirqs_on); - return; - } - - /* - * We're enabling irqs and according to our state above irqs weren't - * already enabled, yet we find the hardware thinks they are in fact - * enabled.. someone messed up their IRQ state tracing. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - /* - * See the fine text that goes along with this variable definition. - */ - if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) - return; - - /* - * Can't allow enabling interrupts while in an interrupt handler, - * that's general bad form and such. Recursion, limited stack etc.. - */ - if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) - return; - - current->lockdep_recursion = 1; - __trace_hardirqs_on_caller(ip); - current->lockdep_recursion = 0; -} -EXPORT_SYMBOL(trace_hardirqs_on_caller); - -void trace_hardirqs_on(void) -{ - trace_hardirqs_on_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_on); - -/* - * Hardirqs were disabled: - */ -void trace_hardirqs_off_caller(unsigned long ip) -{ - struct task_struct *curr = current; - - time_hardirqs_off(CALLER_ADDR0, ip); - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - /* - * So we're supposed to get called after you mask local IRQs, but for - * some reason the hardware doesn't quite think you did a proper job. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - if (curr->hardirqs_enabled) { - /* - * We have done an ON -> OFF transition: - */ - curr->hardirqs_enabled = 0; - curr->hardirq_disable_ip = ip; - curr->hardirq_disable_event = ++curr->irq_events; - debug_atomic_inc(hardirqs_off_events); - } else - debug_atomic_inc(redundant_hardirqs_off); -} -EXPORT_SYMBOL(trace_hardirqs_off_caller); - -void trace_hardirqs_off(void) -{ - trace_hardirqs_off_caller(CALLER_ADDR0); -} -EXPORT_SYMBOL(trace_hardirqs_off); - -/* - * Softirqs will be enabled: - */ -void trace_softirqs_on(unsigned long ip) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - /* - * We fancy IRQs being disabled here, see softirq.c, avoids - * funny state and nesting things. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - if (curr->softirqs_enabled) { - debug_atomic_inc(redundant_softirqs_on); - return; - } - - current->lockdep_recursion = 1; - /* - * We'll do an OFF -> ON transition: - */ - curr->softirqs_enabled = 1; - curr->softirq_enable_ip = ip; - curr->softirq_enable_event = ++curr->irq_events; - debug_atomic_inc(softirqs_on_events); - /* - * We are going to turn softirqs on, so set the - * usage bit for all held locks, if hardirqs are - * enabled too: - */ - if (curr->hardirqs_enabled) - mark_held_locks(curr, SOFTIRQ); - current->lockdep_recursion = 0; -} - -/* - * Softirqs were disabled: - */ -void trace_softirqs_off(unsigned long ip) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - /* - * We fancy IRQs being disabled here, see softirq.c - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return; - - if (curr->softirqs_enabled) { - /* - * We have done an ON -> OFF transition: - */ - curr->softirqs_enabled = 0; - curr->softirq_disable_ip = ip; - curr->softirq_disable_event = ++curr->irq_events; - debug_atomic_inc(softirqs_off_events); - /* - * Whoops, we wanted softirqs off, so why aren't they? - */ - DEBUG_LOCKS_WARN_ON(!softirq_count()); - } else - debug_atomic_inc(redundant_softirqs_off); -} - -static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) -{ - struct task_struct *curr = current; - - if (unlikely(!debug_locks)) - return; - - /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_WAIT)) - return; - - /* this guy won't enter reclaim */ - if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) - return; - - /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) - return; - - /* - * Oi! Can't be having __GFP_FS allocations with IRQs disabled. - */ - if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) - return; - - mark_held_locks(curr, RECLAIM_FS); -} - -static void check_flags(unsigned long flags); - -void lockdep_trace_alloc(gfp_t gfp_mask) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lockdep_trace_alloc(gfp_mask, flags); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} - -static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) -{ - /* - * If non-trylock use in a hardirq or softirq context, then - * mark the lock as used in these contexts: - */ - if (!hlock->trylock) { - if (hlock->read) { - if (curr->hardirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_HARDIRQ_READ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, - LOCK_USED_IN_SOFTIRQ_READ)) - return 0; - } else { - if (curr->hardirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) - return 0; - if (curr->softirq_context) - if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) - return 0; - } - } - if (!hlock->hardirqs_off) { - if (hlock->read) { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQ_READ)) - return 0; - if (curr->softirqs_enabled) - if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQ_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, - LOCK_ENABLED_HARDIRQ)) - return 0; - if (curr->softirqs_enabled) - if (!mark_lock(curr, hlock, - LOCK_ENABLED_SOFTIRQ)) - return 0; - } - } - - /* - * We reuse the irq context infrastructure more broadly as a general - * context checking code. This tests GFP_FS recursion (a lock taken - * during reclaim for a GFP_FS allocation is held over a GFP_FS - * allocation). - */ - if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { - if (hlock->read) { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) - return 0; - } else { - if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) - return 0; - } - } - - return 1; -} - -static int separate_irq_context(struct task_struct *curr, - struct held_lock *hlock) -{ - unsigned int depth = curr->lockdep_depth; - - /* - * Keep track of points where we cross into an interrupt context: - */ - hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + - curr->softirq_context; - if (depth) { - struct held_lock *prev_hlock; - - prev_hlock = curr->held_locks + depth-1; - /* - * If we cross into another context, reset the - * hash key (this also prevents the checking and the - * adding of the dependency to 'prev'): - */ - if (prev_hlock->irq_context != hlock->irq_context) - return 1; - } - return 0; -} - -#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - -static inline -int mark_lock_irq(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */ - return 1; -} - -static inline int mark_irqflags(struct task_struct *curr, - struct held_lock *hlock) -{ - return 1; -} - -static inline int separate_irq_context(struct task_struct *curr, - struct held_lock *hlock) -{ - return 0; -} - -void lockdep_trace_alloc(gfp_t gfp_mask) -{ -} - -#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - -/* - * Mark a lock with a usage bit, and validate the state transition: - */ -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - unsigned int new_mask = 1 << new_bit, ret = 1; - - /* - * If already set then do not dirty the cacheline, - * nor do any checks: - */ - if (likely(hlock_class(this)->usage_mask & new_mask)) - return 1; - - if (!graph_lock()) - return 0; - /* - * Make sure we didn't race: - */ - if (unlikely(hlock_class(this)->usage_mask & new_mask)) { - graph_unlock(); - return 1; - } - - hlock_class(this)->usage_mask |= new_mask; - - if (!save_trace(hlock_class(this)->usage_traces + new_bit)) - return 0; - - switch (new_bit) { -#define LOCKDEP_STATE(__STATE) \ - case LOCK_USED_IN_##__STATE: \ - case LOCK_USED_IN_##__STATE##_READ: \ - case LOCK_ENABLED_##__STATE: \ - case LOCK_ENABLED_##__STATE##_READ: -#include "lockdep_states.h" -#undef LOCKDEP_STATE - ret = mark_lock_irq(curr, this, new_bit); - if (!ret) - return 0; - break; - case LOCK_USED: - debug_atomic_dec(nr_unused_locks); - break; - default: - if (!debug_locks_off_graph_unlock()) - return 0; - WARN_ON(1); - return 0; - } - - graph_unlock(); - - /* - * We must printk outside of the graph_lock: - */ - if (ret == 2) { - printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); - print_lock(this); - print_irqtrace_events(curr); - dump_stack(); - } - - return ret; -} - -/* - * Initialize a lock instance's lock-class mapping info: - */ -void lockdep_init_map(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - int i; - - kmemcheck_mark_initialized(lock, sizeof(*lock)); - - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) - lock->class_cache[i] = NULL; - -#ifdef CONFIG_LOCK_STAT - lock->cpu = raw_smp_processor_id(); -#endif - - /* - * Can't be having no nameless bastards around this place! - */ - if (DEBUG_LOCKS_WARN_ON(!name)) { - lock->name = "NULL"; - return; - } - - lock->name = name; - - /* - * No key, no joy, we need to hash something. - */ - if (DEBUG_LOCKS_WARN_ON(!key)) - return; - /* - * Sanity check, the lock-class key must be persistent: - */ - if (!static_obj(key)) { - printk("BUG: key %p not in .data!\n", key); - /* - * What it says above ^^^^^, I suggest you read it. - */ - DEBUG_LOCKS_WARN_ON(1); - return; - } - lock->key = key; - - if (unlikely(!debug_locks)) - return; - - if (subclass) - register_lock_class(lock, subclass, 1); -} -EXPORT_SYMBOL_GPL(lockdep_init_map); - -struct lock_class_key __lockdep_no_validate__; -EXPORT_SYMBOL_GPL(__lockdep_no_validate__); - -static int -print_lock_nested_lock_not_held(struct task_struct *curr, - struct held_lock *hlock, - unsigned long ip) -{ - if (!debug_locks_off()) - return 0; - if (debug_locks_silent) - return 0; - - printk("\n"); - printk("==================================\n"); - printk("[ BUG: Nested lock was not taken ]\n"); - print_kernel_ident(); - printk("----------------------------------\n"); - - printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); - print_lock(hlock); - - printk("\nbut this task is not holding:\n"); - printk("%s\n", hlock->nest_lock->name); - - printk("\nstack backtrace:\n"); - dump_stack(); - - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -static int __lock_is_held(struct lockdep_map *lock); - -/* - * This gets called for every mutex_lock*()/spin_lock*() operation. - * We maintain the dependency maps and validate the locking attempt: - */ -static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, int hardirqs_off, - struct lockdep_map *nest_lock, unsigned long ip, - int references) -{ - struct task_struct *curr = current; - struct lock_class *class = NULL; - struct held_lock *hlock; - unsigned int depth, id; - int chain_head = 0; - int class_idx; - u64 chain_key; - - if (!prove_locking) - check = 1; - - if (unlikely(!debug_locks)) - return 0; - - /* - * Lockdep should run with IRQs disabled, otherwise we could - * get an interrupt which would want to take locks, which would - * end up in lockdep and have you got a head-ache already? - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - - if (lock->key == &__lockdep_no_validate__) - check = 1; - - if (subclass < NR_LOCKDEP_CACHING_CLASSES) - class = lock->class_cache[subclass]; - /* - * Not cached? - */ - if (unlikely(!class)) { - class = register_lock_class(lock, subclass, 0); - if (!class) - return 0; - } - atomic_inc((atomic_t *)&class->ops); - if (very_verbose(class)) { - printk("\nacquire class [%p] %s", class->key, class->name); - if (class->name_version > 1) - printk("#%d", class->name_version); - printk("\n"); - dump_stack(); - } - - /* - * Add the lock to the list of currently held locks. - * (we dont increase the depth just yet, up until the - * dependency checks are done) - */ - depth = curr->lockdep_depth; - /* - * Ran out of static storage for our per-task lock stack again have we? - */ - if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) - return 0; - - class_idx = class - lock_classes + 1; - - if (depth) { - hlock = curr->held_locks + depth - 1; - if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) - hlock->references++; - else - hlock->references = 2; - - return 1; - } - } - - hlock = curr->held_locks + depth; - /* - * Plain impossible, we just registered it and checked it weren't no - * NULL like.. I bet this mushroom I ate was good! - */ - if (DEBUG_LOCKS_WARN_ON(!class)) - return 0; - hlock->class_idx = class_idx; - hlock->acquire_ip = ip; - hlock->instance = lock; - hlock->nest_lock = nest_lock; - hlock->trylock = trylock; - hlock->read = read; - hlock->check = check; - hlock->hardirqs_off = !!hardirqs_off; - hlock->references = references; -#ifdef CONFIG_LOCK_STAT - hlock->waittime_stamp = 0; - hlock->holdtime_stamp = lockstat_clock(); -#endif - - if (check == 2 && !mark_irqflags(curr, hlock)) - return 0; - - /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED)) - return 0; - - /* - * Calculate the chain hash: it's the combined hash of all the - * lock keys along the dependency chain. We save the hash value - * at every step so that we can get the current hash easily - * after unlock. The chain hash is then used to cache dependency - * results. - * - * The 'key ID' is what is the most compact key value to drive - * the hash, not class->key. - */ - id = class - lock_classes; - /* - * Whoops, we did it again.. ran straight out of our static allocation. - */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) - return 0; - - chain_key = curr->curr_chain_key; - if (!depth) { - /* - * How can we have a chain hash when we ain't got no keys?! - */ - if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) - return 0; - chain_head = 1; - } - - hlock->prev_chain_key = chain_key; - if (separate_irq_context(curr, hlock)) { - chain_key = 0; - chain_head = 1; - } - chain_key = iterate_chain_key(chain_key, id); - - if (nest_lock && !__lock_is_held(nest_lock)) - return print_lock_nested_lock_not_held(curr, hlock, ip); - - if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) - return 0; - - curr->curr_chain_key = chain_key; - curr->lockdep_depth++; - check_chain_key(curr); -#ifdef CONFIG_DEBUG_LOCKDEP - if (unlikely(!debug_locks)) - return 0; -#endif - if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { - debug_locks_off(); - print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); - printk(KERN_DEBUG "depth: %i max: %lu!\n", - curr->lockdep_depth, MAX_LOCK_DEPTH); - - lockdep_print_held_locks(current); - debug_show_all_locks(); - dump_stack(); - - return 0; - } - - if (unlikely(curr->lockdep_depth > max_lockdep_depth)) - max_lockdep_depth = curr->lockdep_depth; - - return 1; -} - -static int -print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) -{ - if (!debug_locks_off()) - return 0; - if (debug_locks_silent) - return 0; - - printk("\n"); - printk("=====================================\n"); - printk("[ BUG: bad unlock balance detected! ]\n"); - print_kernel_ident(); - printk("-------------------------------------\n"); - printk("%s/%d is trying to release lock (", - curr->comm, task_pid_nr(curr)); - print_lockdep_cache(lock); - printk(") at:\n"); - print_ip_sym(ip); - printk("but there are no more locks to release!\n"); - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -/* - * Common debugging checks for both nested and non-nested unlock: - */ -static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) -{ - if (unlikely(!debug_locks)) - return 0; - /* - * Lockdep should run with IRQs disabled, recursion, head-ache, etc.. - */ - if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) - return 0; - - if (curr->lockdep_depth <= 0) - return print_unlock_imbalance_bug(curr, lock, ip); - - return 1; -} - -static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) -{ - if (hlock->instance == lock) - return 1; - - if (hlock->references) { - struct lock_class *class = lock->class_cache[0]; - - if (!class) - class = look_up_lock_class(lock, 0); - - /* - * If look_up_lock_class() failed to find a class, we're trying - * to test if we hold a lock that has never yet been acquired. - * Clearly if the lock hasn't been acquired _ever_, we're not - * holding it either, so report failure. - */ - if (!class) - return 0; - - /* - * References, but not a lock we're actually ref-counting? - * State got messed up, follow the sites that change ->references - * and try to make sense of it. - */ - if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) - return 0; - - if (hlock->class_idx == class - lock_classes + 1) - return 1; - } - - return 0; -} - -static int -__lock_set_class(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, unsigned int subclass, - unsigned long ip) -{ - struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; - struct lock_class *class; - unsigned int depth; - int i; - - depth = curr->lockdep_depth; - /* - * This function is about (re)setting the class of a held lock, - * yet we're not actually holding any locks. Naughty user! - */ - if (DEBUG_LOCKS_WARN_ON(!depth)) - return 0; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_imbalance_bug(curr, lock, ip); - -found_it: - lockdep_init_map(lock, name, key, 0); - class = register_lock_class(lock, subclass, 0); - hlock->class_idx = class - lock_classes + 1; - - curr->lockdep_depth = i; - curr->curr_chain_key = hlock->prev_chain_key; - - for (; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references)) - return 0; - } - - /* - * I took it apart and put it back together again, except now I have - * these 'spare' parts.. where shall I put them. - */ - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) - return 0; - return 1; -} - -/* - * Remove the lock to the list of currently held locks in a - * potentially non-nested (out of order) manner. This is a - * relatively rare operation, as all the unlock APIs default - * to nested mode (which uses lock_release()): - */ -static int -lock_release_non_nested(struct task_struct *curr, - struct lockdep_map *lock, unsigned long ip) -{ - struct held_lock *hlock, *prev_hlock; - unsigned int depth; - int i; - - /* - * Check whether the lock exists in the current stack - * of held locks: - */ - depth = curr->lockdep_depth; - /* - * So we're all set to release this lock.. wait what lock? We don't - * own any locks, you've been drinking again? - */ - if (DEBUG_LOCKS_WARN_ON(!depth)) - return 0; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_imbalance_bug(curr, lock, ip); - -found_it: - if (hlock->instance == lock) - lock_release_holdtime(hlock); - - if (hlock->references) { - hlock->references--; - if (hlock->references) { - /* - * We had, and after removing one, still have - * references, the current lock stack is still - * valid. We're done! - */ - return 1; - } - } - - /* - * We have the right lock to unlock, 'hlock' points to it. - * Now we remove it from the stack, and add back the other - * entries (if any), recalculating the hash along the way: - */ - - curr->lockdep_depth = i; - curr->curr_chain_key = hlock->prev_chain_key; - - for (i++; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references)) - return 0; - } - - /* - * We had N bottles of beer on the wall, we drank one, but now - * there's not N-1 bottles of beer left on the wall... - */ - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) - return 0; - return 1; -} - -/* - * Remove the lock to the list of currently held locks - this gets - * called on mutex_unlock()/spin_unlock*() (or on a failed - * mutex_lock_interruptible()). This is done for unlocks that nest - * perfectly. (i.e. the current top of the lock-stack is unlocked) - */ -static int lock_release_nested(struct task_struct *curr, - struct lockdep_map *lock, unsigned long ip) -{ - struct held_lock *hlock; - unsigned int depth; - - /* - * Pop off the top of the lock stack: - */ - depth = curr->lockdep_depth - 1; - hlock = curr->held_locks + depth; - - /* - * Is the unlock non-nested: - */ - if (hlock->instance != lock || hlock->references) - return lock_release_non_nested(curr, lock, ip); - curr->lockdep_depth--; - - /* - * No more locks, but somehow we've got hash left over, who left it? - */ - if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) - return 0; - - curr->curr_chain_key = hlock->prev_chain_key; - - lock_release_holdtime(hlock); - -#ifdef CONFIG_DEBUG_LOCKDEP - hlock->prev_chain_key = 0; - hlock->class_idx = 0; - hlock->acquire_ip = 0; - hlock->irq_context = 0; -#endif - return 1; -} - -/* - * Remove the lock to the list of currently held locks - this gets - * called on mutex_unlock()/spin_unlock*() (or on a failed - * mutex_lock_interruptible()). This is done for unlocks that nest - * perfectly. (i.e. the current top of the lock-stack is unlocked) - */ -static void -__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) -{ - struct task_struct *curr = current; - - if (!check_unlock(curr, lock, ip)) - return; - - if (nested) { - if (!lock_release_nested(curr, lock, ip)) - return; - } else { - if (!lock_release_non_nested(curr, lock, ip)) - return; - } - - check_chain_key(curr); -} - -static int __lock_is_held(struct lockdep_map *lock) -{ - struct task_struct *curr = current; - int i; - - for (i = 0; i < curr->lockdep_depth; i++) { - struct held_lock *hlock = curr->held_locks + i; - - if (match_held_lock(hlock, lock)) - return 1; - } - - return 0; -} - -/* - * Check whether we follow the irq-flags state precisely: - */ -static void check_flags(unsigned long flags) -{ -#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \ - defined(CONFIG_TRACE_IRQFLAGS) - if (!debug_locks) - return; - - if (irqs_disabled_flags(flags)) { - if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { - printk("possible reason: unannotated irqs-off.\n"); - } - } else { - if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { - printk("possible reason: unannotated irqs-on.\n"); - } - } - - /* - * We dont accurately track softirq state in e.g. - * hardirq contexts (such as on 4KSTACKS), so only - * check if not in hardirq contexts: - */ - if (!hardirq_count()) { - if (softirq_count()) { - /* like the above, but with softirqs */ - DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); - } else { - /* lick the above, does it taste good? */ - DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); - } - } - - if (!debug_locks) - print_irqtrace_events(current); -#endif -} - -void lock_set_class(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, unsigned int subclass, - unsigned long ip) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - current->lockdep_recursion = 1; - check_flags(flags); - if (__lock_set_class(lock, name, key, subclass, ip)) - check_chain_key(current); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_set_class); - -/* - * We are not always called with irqs disabled - do that here, - * and also avoid lockdep recursion: - */ -void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, - struct lockdep_map *nest_lock, unsigned long ip) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - - current->lockdep_recursion = 1; - trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); - __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), nest_lock, ip, 0); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_acquire); - -void lock_release(struct lockdep_map *lock, int nested, - unsigned long ip) -{ - unsigned long flags; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - trace_lock_release(lock, ip); - __lock_release(lock, nested, ip); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_release); - -int lock_is_held(struct lockdep_map *lock) -{ - unsigned long flags; - int ret = 0; - - if (unlikely(current->lockdep_recursion)) - return 1; /* avoid false negative lockdep_assert_held() */ - - raw_local_irq_save(flags); - check_flags(flags); - - current->lockdep_recursion = 1; - ret = __lock_is_held(lock); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); - - return ret; -} -EXPORT_SYMBOL_GPL(lock_is_held); - -void lockdep_set_current_reclaim_state(gfp_t gfp_mask) -{ - current->lockdep_reclaim_gfp = gfp_mask; -} - -void lockdep_clear_current_reclaim_state(void) -{ - current->lockdep_reclaim_gfp = 0; -} - -#ifdef CONFIG_LOCK_STAT -static int -print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) -{ - if (!debug_locks_off()) - return 0; - if (debug_locks_silent) - return 0; - - printk("\n"); - printk("=================================\n"); - printk("[ BUG: bad contention detected! ]\n"); - print_kernel_ident(); - printk("---------------------------------\n"); - printk("%s/%d is trying to contend lock (", - curr->comm, task_pid_nr(curr)); - print_lockdep_cache(lock); - printk(") at:\n"); - print_ip_sym(ip); - printk("but there are no locks held!\n"); - printk("\nother info that might help us debug this:\n"); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); - - return 0; -} - -static void -__lock_contended(struct lockdep_map *lock, unsigned long ip) -{ - struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; - struct lock_class_stats *stats; - unsigned int depth; - int i, contention_point, contending_point; - - depth = curr->lockdep_depth; - /* - * Whee, we contended on this lock, except it seems we're not - * actually trying to acquire anything much at all.. - */ - if (DEBUG_LOCKS_WARN_ON(!depth)) - return; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - print_lock_contention_bug(curr, lock, ip); - return; - -found_it: - if (hlock->instance != lock) - return; - - hlock->waittime_stamp = lockstat_clock(); - - contention_point = lock_point(hlock_class(hlock)->contention_point, ip); - contending_point = lock_point(hlock_class(hlock)->contending_point, - lock->ip); - - stats = get_lock_stats(hlock_class(hlock)); - if (contention_point < LOCKSTAT_POINTS) - stats->contention_point[contention_point]++; - if (contending_point < LOCKSTAT_POINTS) - stats->contending_point[contending_point]++; - if (lock->cpu != smp_processor_id()) - stats->bounces[bounce_contended + !!hlock->read]++; - put_lock_stats(stats); -} - -static void -__lock_acquired(struct lockdep_map *lock, unsigned long ip) -{ - struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; - struct lock_class_stats *stats; - unsigned int depth; - u64 now, waittime = 0; - int i, cpu; - - depth = curr->lockdep_depth; - /* - * Yay, we acquired ownership of this lock we didn't try to - * acquire, how the heck did that happen? - */ - if (DEBUG_LOCKS_WARN_ON(!depth)) - return; - - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - print_lock_contention_bug(curr, lock, _RET_IP_); - return; - -found_it: - if (hlock->instance != lock) - return; - - cpu = smp_processor_id(); - if (hlock->waittime_stamp) { - now = lockstat_clock(); - waittime = now - hlock->waittime_stamp; - hlock->holdtime_stamp = now; - } - - trace_lock_acquired(lock, ip); - - stats = get_lock_stats(hlock_class(hlock)); - if (waittime) { - if (hlock->read) - lock_time_inc(&stats->read_waittime, waittime); - else - lock_time_inc(&stats->write_waittime, waittime); - } - if (lock->cpu != cpu) - stats->bounces[bounce_acquired + !!hlock->read]++; - put_lock_stats(stats); - - lock->cpu = cpu; - lock->ip = ip; -} - -void lock_contended(struct lockdep_map *lock, unsigned long ip) -{ - unsigned long flags; - - if (unlikely(!lock_stat)) - return; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - trace_lock_contended(lock, ip); - __lock_contended(lock, ip); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_contended); - -void lock_acquired(struct lockdep_map *lock, unsigned long ip) -{ - unsigned long flags; - - if (unlikely(!lock_stat)) - return; - - if (unlikely(current->lockdep_recursion)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - __lock_acquired(lock, ip); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_acquired); -#endif - -/* - * Used by the testsuite, sanitize the validator state - * after a simulated failure: - */ - -void lockdep_reset(void) -{ - unsigned long flags; - int i; - - raw_local_irq_save(flags); - current->curr_chain_key = 0; - current->lockdep_depth = 0; - current->lockdep_recursion = 0; - memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); - nr_hardirq_chains = 0; - nr_softirq_chains = 0; - nr_process_chains = 0; - debug_locks = 1; - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); - raw_local_irq_restore(flags); -} - -static void zap_class(struct lock_class *class) -{ - int i; - - /* - * Remove all dependencies this lock is - * involved in: - */ - for (i = 0; i < nr_list_entries; i++) { - if (list_entries[i].class == class) - list_del_rcu(&list_entries[i].entry); - } - /* - * Unhash the class and remove it from the all_lock_classes list: - */ - list_del_rcu(&class->hash_entry); - list_del_rcu(&class->lock_entry); - - class->key = NULL; -} - -static inline int within(const void *addr, void *start, unsigned long size) -{ - return addr >= start && addr < start + size; -} - -void lockdep_free_key_range(void *start, unsigned long size) -{ - struct lock_class *class, *next; - struct list_head *head; - unsigned long flags; - int i; - int locked; - - raw_local_irq_save(flags); - locked = graph_lock(); - - /* - * Unhash all classes that were created by this module: - */ - for (i = 0; i < CLASSHASH_SIZE; i++) { - head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_safe(class, next, head, hash_entry) { - if (within(class->key, start, size)) - zap_class(class); - else if (within(class->name, start, size)) - zap_class(class); - } - } - - if (locked) - graph_unlock(); - raw_local_irq_restore(flags); -} - -void lockdep_reset_lock(struct lockdep_map *lock) -{ - struct lock_class *class, *next; - struct list_head *head; - unsigned long flags; - int i, j; - int locked; - - raw_local_irq_save(flags); - - /* - * Remove all classes this lock might have: - */ - for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { - /* - * If the class exists we look it up and zap it: - */ - class = look_up_lock_class(lock, j); - if (class) - zap_class(class); - } - /* - * Debug check: in the end all mapped classes should - * be gone. - */ - locked = graph_lock(); - for (i = 0; i < CLASSHASH_SIZE; i++) { - head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_safe(class, next, head, hash_entry) { - int match = 0; - - for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) - match |= class == lock->class_cache[j]; - - if (unlikely(match)) { - if (debug_locks_off_graph_unlock()) { - /* - * We all just reset everything, how did it match? - */ - WARN_ON(1); - } - goto out_restore; - } - } - } - if (locked) - graph_unlock(); - -out_restore: - raw_local_irq_restore(flags); -} - -void lockdep_init(void) -{ - int i; - - /* - * Some architectures have their own start_kernel() - * code which calls lockdep_init(), while we also - * call lockdep_init() from the start_kernel() itself, - * and we want to initialize the hashes only once: - */ - if (lockdep_initialized) - return; - - for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); - - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); - - lockdep_initialized = 1; -} - -void __init lockdep_info(void) -{ - printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); - - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); - printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); - printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); - printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); - printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); - printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); - - printk(" memory used by lock dependency info: %lu kB\n", - (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + - sizeof(struct list_head) * CLASSHASH_SIZE + - sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + - sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE -#ifdef CONFIG_PROVE_LOCKING - + sizeof(struct circular_queue) -#endif - ) / 1024 - ); - - printk(" per task-struct memory footprint: %lu bytes\n", - sizeof(struct held_lock) * MAX_LOCK_DEPTH); - -#ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) { - printk("WARNING: lockdep init error! lock-%s was acquired" - "before lockdep_init\n", lock_init_error); - printk("Call stack leading to lockdep invocation was:\n"); - print_stack_trace(&lockdep_init_trace, 0); - } -#endif -} - -static void -print_freed_lock_bug(struct task_struct *curr, const void *mem_from, - const void *mem_to, struct held_lock *hlock) -{ - if (!debug_locks_off()) - return; - if (debug_locks_silent) - return; - - printk("\n"); - printk("=========================\n"); - printk("[ BUG: held lock freed! ]\n"); - print_kernel_ident(); - printk("-------------------------\n"); - printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", - curr->comm, task_pid_nr(curr), mem_from, mem_to-1); - print_lock(hlock); - lockdep_print_held_locks(curr); - - printk("\nstack backtrace:\n"); - dump_stack(); -} - -static inline int not_in_range(const void* mem_from, unsigned long mem_len, - const void* lock_from, unsigned long lock_len) -{ - return lock_from + lock_len <= mem_from || - mem_from + mem_len <= lock_from; -} - -/* - * Called when kernel memory is freed (or unmapped), or if a lock - * is destroyed or reinitialized - this code checks whether there is - * any held lock in the memory range of to : - */ -void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) -{ - struct task_struct *curr = current; - struct held_lock *hlock; - unsigned long flags; - int i; - - if (unlikely(!debug_locks)) - return; - - local_irq_save(flags); - for (i = 0; i < curr->lockdep_depth; i++) { - hlock = curr->held_locks + i; - - if (not_in_range(mem_from, mem_len, hlock->instance, - sizeof(*hlock->instance))) - continue; - - print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock); - break; - } - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); - -static void print_held_locks_bug(void) -{ - if (!debug_locks_off()) - return; - if (debug_locks_silent) - return; - - printk("\n"); - printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! ]\n", - current->comm, task_pid_nr(current)); - print_kernel_ident(); - printk("-------------------------------------\n"); - lockdep_print_held_locks(current); - printk("\nstack backtrace:\n"); - dump_stack(); -} - -void debug_check_no_locks_held(void) -{ - if (unlikely(current->lockdep_depth > 0)) - print_held_locks_bug(); -} -EXPORT_SYMBOL_GPL(debug_check_no_locks_held); - -void debug_show_all_locks(void) -{ - struct task_struct *g, *p; - int count = 10; - int unlock = 1; - - if (unlikely(!debug_locks)) { - printk("INFO: lockdep is turned off.\n"); - return; - } - printk("\nShowing all locks held in the system:\n"); - - /* - * Here we try to get the tasklist_lock as hard as possible, - * if not successful after 2 seconds we ignore it (but keep - * trying). This is to enable a debug printout even if a - * tasklist_lock-holding task deadlocks or crashes. - */ -retry: - if (!read_trylock(&tasklist_lock)) { - if (count == 10) - printk("hm, tasklist_lock locked, retrying... "); - if (count) { - count--; - printk(" #%d", 10-count); - mdelay(200); - goto retry; - } - printk(" ignoring it.\n"); - unlock = 0; - } else { - if (count != 10) - printk(KERN_CONT " locked it.\n"); - } - - do_each_thread(g, p) { - /* - * It's not reliable to print a task's held locks - * if it's not sleeping (or if it's not the current - * task): - */ - if (p->state == TASK_RUNNING && p != current) - continue; - if (p->lockdep_depth) - lockdep_print_held_locks(p); - if (!unlock) - if (read_trylock(&tasklist_lock)) - unlock = 1; - } while_each_thread(g, p); - - printk("\n"); - printk("=============================================\n\n"); - - if (unlock) - read_unlock(&tasklist_lock); -} -EXPORT_SYMBOL_GPL(debug_show_all_locks); - -/* - * Careful: only use this function if you are sure that - * the task cannot run in parallel! - */ -void debug_show_held_locks(struct task_struct *task) -{ - if (unlikely(!debug_locks)) { - printk("INFO: lockdep is turned off.\n"); - return; - } - lockdep_print_held_locks(task); -} -EXPORT_SYMBOL_GPL(debug_show_held_locks); - -void lockdep_sys_exit(void) -{ - struct task_struct *curr = current; - - if (unlikely(curr->lockdep_depth)) { - if (!debug_locks_off()) - return; - printk("\n"); - printk("================================================\n"); - printk("[ BUG: lock held when returning to user space! ]\n"); - print_kernel_ident(); - printk("------------------------------------------------\n"); - printk("%s/%d is leaving the kernel with locks still held!\n", - curr->comm, curr->pid); - lockdep_print_held_locks(curr); - } -} - -void lockdep_rcu_suspicious(const char *file, const int line, const char *s) -{ - struct task_struct *curr = current; - -#ifndef CONFIG_PROVE_RCU_REPEATEDLY - if (!debug_locks_off()) - return; -#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ - /* Note: the following can be executed concurrently, so be careful. */ - printk("\n"); - printk("===============================\n"); - printk("[ INFO: suspicious RCU usage. ]\n"); - print_kernel_ident(); - printk("-------------------------------\n"); - printk("%s:%d %s!\n", file, line, s); - printk("\nother info that might help us debug this:\n\n"); - printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", - !rcu_lockdep_current_cpu_online() - ? "RCU used illegally from offline CPU!\n" - : !rcu_is_watching() - ? "RCU used illegally from idle CPU!\n" - : "", - rcu_scheduler_active, debug_locks); - - /* - * If a CPU is in the RCU-free window in idle (ie: in the section - * between rcu_idle_enter() and rcu_idle_exit(), then RCU - * considers that CPU to be in an "extended quiescent state", - * which means that RCU will be completely ignoring that CPU. - * Therefore, rcu_read_lock() and friends have absolutely no - * effect on a CPU running in that state. In other words, even if - * such an RCU-idle CPU has called rcu_read_lock(), RCU might well - * delete data structures out from under it. RCU really has no - * choice here: we need to keep an RCU-free window in idle where - * the CPU may possibly enter into low power mode. This way we can - * notice an extended quiescent state to other CPUs that started a grace - * period. Otherwise we would delay any grace period as long as we run - * in the idle task. - * - * So complain bitterly if someone does call rcu_read_lock(), - * rcu_read_lock_bh() and so on from extended quiescent states. - */ - if (!rcu_is_watching()) - printk("RCU used illegally from extended quiescent state!\n"); - - lockdep_print_held_locks(curr); - printk("\nstack backtrace:\n"); - dump_stack(); -} -EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h deleted file mode 100644 index 4f560cfedc8f..000000000000 --- a/kernel/lockdep_internals.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * kernel/lockdep_internals.h - * - * Runtime locking correctness validator - * - * lockdep subsystem internal functions and variables. - */ - -/* - * Lock-class usage-state bits: - */ -enum lock_usage_bit { -#define LOCKDEP_STATE(__STATE) \ - LOCK_USED_IN_##__STATE, \ - LOCK_USED_IN_##__STATE##_READ, \ - LOCK_ENABLED_##__STATE, \ - LOCK_ENABLED_##__STATE##_READ, -#include "lockdep_states.h" -#undef LOCKDEP_STATE - LOCK_USED, - LOCK_USAGE_STATES -}; - -/* - * Usage-state bitmasks: - */ -#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE), - -enum { -#define LOCKDEP_STATE(__STATE) \ - __LOCKF(USED_IN_##__STATE) \ - __LOCKF(USED_IN_##__STATE##_READ) \ - __LOCKF(ENABLED_##__STATE) \ - __LOCKF(ENABLED_##__STATE##_READ) -#include "lockdep_states.h" -#undef LOCKDEP_STATE - __LOCKF(USED) -}; - -#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) -#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) - -#define LOCKF_ENABLED_IRQ_READ \ - (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) -#define LOCKF_USED_IN_IRQ_READ \ - (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) - -/* - * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies - * we track. - * - * We use the per-lock dependency maps in two ways: we grow it by adding - * every to-be-taken lock to all currently held lock's own dependency - * table (if it's not there yet), and we check it for lock order - * conflicts and deadlocks. - */ -#define MAX_LOCKDEP_ENTRIES 16384UL - -#define MAX_LOCKDEP_CHAINS_BITS 15 -#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) - -#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) - -/* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the hash_lock. - */ -#define MAX_STACK_TRACE_ENTRIES 262144UL - -extern struct list_head all_lock_classes; -extern struct lock_chain lock_chains[]; - -#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) - -extern void get_usage_chars(struct lock_class *class, - char usage[LOCK_USAGE_CHARS]); - -extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); - -struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); - -extern unsigned long nr_lock_classes; -extern unsigned long nr_list_entries; -extern unsigned long nr_lock_chains; -extern int nr_chain_hlocks; -extern unsigned long nr_stack_trace_entries; - -extern unsigned int nr_hardirq_chains; -extern unsigned int nr_softirq_chains; -extern unsigned int nr_process_chains; -extern unsigned int max_lockdep_depth; -extern unsigned int max_recursion_depth; - -extern unsigned int max_bfs_queue_depth; - -#ifdef CONFIG_PROVE_LOCKING -extern unsigned long lockdep_count_forward_deps(struct lock_class *); -extern unsigned long lockdep_count_backward_deps(struct lock_class *); -#else -static inline unsigned long -lockdep_count_forward_deps(struct lock_class *class) -{ - return 0; -} -static inline unsigned long -lockdep_count_backward_deps(struct lock_class *class) -{ - return 0; -} -#endif - -#ifdef CONFIG_DEBUG_LOCKDEP - -#include -/* - * Various lockdep statistics. - * We want them per cpu as they are often accessed in fast path - * and we want to avoid too much cache bouncing. - */ -struct lockdep_stats { - int chain_lookup_hits; - int chain_lookup_misses; - int hardirqs_on_events; - int hardirqs_off_events; - int redundant_hardirqs_on; - int redundant_hardirqs_off; - int softirqs_on_events; - int softirqs_off_events; - int redundant_softirqs_on; - int redundant_softirqs_off; - int nr_unused_locks; - int nr_cyclic_checks; - int nr_cyclic_check_recursions; - int nr_find_usage_forwards_checks; - int nr_find_usage_forwards_recursions; - int nr_find_usage_backwards_checks; - int nr_find_usage_backwards_recursions; -}; - -DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); - -#define __debug_atomic_inc(ptr) \ - this_cpu_inc(lockdep_stats.ptr); - -#define debug_atomic_inc(ptr) { \ - WARN_ON_ONCE(!irqs_disabled()); \ - __this_cpu_inc(lockdep_stats.ptr); \ -} - -#define debug_atomic_dec(ptr) { \ - WARN_ON_ONCE(!irqs_disabled()); \ - __this_cpu_dec(lockdep_stats.ptr); \ -} - -#define debug_atomic_read(ptr) ({ \ - struct lockdep_stats *__cpu_lockdep_stats; \ - unsigned long long __total = 0; \ - int __cpu; \ - for_each_possible_cpu(__cpu) { \ - __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \ - __total += __cpu_lockdep_stats->ptr; \ - } \ - __total; \ -}) -#else -# define __debug_atomic_inc(ptr) do { } while (0) -# define debug_atomic_inc(ptr) do { } while (0) -# define debug_atomic_dec(ptr) do { } while (0) -# define debug_atomic_read(ptr) 0 -#endif diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c deleted file mode 100644 index 09220656d888..000000000000 --- a/kernel/lockdep_proc.c +++ /dev/null @@ -1,683 +0,0 @@ -/* - * kernel/lockdep_proc.c - * - * Runtime locking correctness validator - * - * Started by Ingo Molnar: - * - * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra - * - * Code for /proc/lockdep and /proc/lockdep_stats: - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "lockdep_internals.h" - -static void *l_next(struct seq_file *m, void *v, loff_t *pos) -{ - return seq_list_next(v, &all_lock_classes, pos); -} - -static void *l_start(struct seq_file *m, loff_t *pos) -{ - return seq_list_start_head(&all_lock_classes, *pos); -} - -static void l_stop(struct seq_file *m, void *v) -{ -} - -static void print_name(struct seq_file *m, struct lock_class *class) -{ - char str[KSYM_NAME_LEN]; - const char *name = class->name; - - if (!name) { - name = __get_key_name(class->key, str); - seq_printf(m, "%s", name); - } else{ - seq_printf(m, "%s", name); - if (class->name_version > 1) - seq_printf(m, "#%d", class->name_version); - if (class->subclass) - seq_printf(m, "/%d", class->subclass); - } -} - -static int l_show(struct seq_file *m, void *v) -{ - struct lock_class *class = list_entry(v, struct lock_class, lock_entry); - struct lock_list *entry; - char usage[LOCK_USAGE_CHARS]; - - if (v == &all_lock_classes) { - seq_printf(m, "all lock classes:\n"); - return 0; - } - - seq_printf(m, "%p", class->key); -#ifdef CONFIG_DEBUG_LOCKDEP - seq_printf(m, " OPS:%8ld", class->ops); -#endif -#ifdef CONFIG_PROVE_LOCKING - seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); - seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); -#endif - - get_usage_chars(class, usage); - seq_printf(m, " %s", usage); - - seq_printf(m, ": "); - print_name(m, class); - seq_puts(m, "\n"); - - list_for_each_entry(entry, &class->locks_after, entry) { - if (entry->distance == 1) { - seq_printf(m, " -> [%p] ", entry->class->key); - print_name(m, entry->class); - seq_puts(m, "\n"); - } - } - seq_puts(m, "\n"); - - return 0; -} - -static const struct seq_operations lockdep_ops = { - .start = l_start, - .next = l_next, - .stop = l_stop, - .show = l_show, -}; - -static int lockdep_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_ops); -} - -static const struct file_operations proc_lockdep_operations = { - .open = lockdep_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#ifdef CONFIG_PROVE_LOCKING -static void *lc_start(struct seq_file *m, loff_t *pos) -{ - if (*pos == 0) - return SEQ_START_TOKEN; - - if (*pos - 1 < nr_lock_chains) - return lock_chains + (*pos - 1); - - return NULL; -} - -static void *lc_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return lc_start(m, pos); -} - -static void lc_stop(struct seq_file *m, void *v) -{ -} - -static int lc_show(struct seq_file *m, void *v) -{ - struct lock_chain *chain = v; - struct lock_class *class; - int i; - - if (v == SEQ_START_TOKEN) { - seq_printf(m, "all lock chains:\n"); - return 0; - } - - seq_printf(m, "irq_context: %d\n", chain->irq_context); - - for (i = 0; i < chain->depth; i++) { - class = lock_chain_get_class(chain, i); - if (!class->key) - continue; - - seq_printf(m, "[%p] ", class->key); - print_name(m, class); - seq_puts(m, "\n"); - } - seq_puts(m, "\n"); - - return 0; -} - -static const struct seq_operations lockdep_chains_ops = { - .start = lc_start, - .next = lc_next, - .stop = lc_stop, - .show = lc_show, -}; - -static int lockdep_chains_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &lockdep_chains_ops); -} - -static const struct file_operations proc_lockdep_chains_operations = { - .open = lockdep_chains_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif /* CONFIG_PROVE_LOCKING */ - -static void lockdep_stats_debug_show(struct seq_file *m) -{ -#ifdef CONFIG_DEBUG_LOCKDEP - unsigned long long hi1 = debug_atomic_read(hardirqs_on_events), - hi2 = debug_atomic_read(hardirqs_off_events), - hr1 = debug_atomic_read(redundant_hardirqs_on), - hr2 = debug_atomic_read(redundant_hardirqs_off), - si1 = debug_atomic_read(softirqs_on_events), - si2 = debug_atomic_read(softirqs_off_events), - sr1 = debug_atomic_read(redundant_softirqs_on), - sr2 = debug_atomic_read(redundant_softirqs_off); - - seq_printf(m, " chain lookup misses: %11llu\n", - debug_atomic_read(chain_lookup_misses)); - seq_printf(m, " chain lookup hits: %11llu\n", - debug_atomic_read(chain_lookup_hits)); - seq_printf(m, " cyclic checks: %11llu\n", - debug_atomic_read(nr_cyclic_checks)); - seq_printf(m, " find-mask forwards checks: %11llu\n", - debug_atomic_read(nr_find_usage_forwards_checks)); - seq_printf(m, " find-mask backwards checks: %11llu\n", - debug_atomic_read(nr_find_usage_backwards_checks)); - - seq_printf(m, " hardirq on events: %11llu\n", hi1); - seq_printf(m, " hardirq off events: %11llu\n", hi2); - seq_printf(m, " redundant hardirq ons: %11llu\n", hr1); - seq_printf(m, " redundant hardirq offs: %11llu\n", hr2); - seq_printf(m, " softirq on events: %11llu\n", si1); - seq_printf(m, " softirq off events: %11llu\n", si2); - seq_printf(m, " redundant softirq ons: %11llu\n", sr1); - seq_printf(m, " redundant softirq offs: %11llu\n", sr2); -#endif -} - -static int lockdep_stats_show(struct seq_file *m, void *v) -{ - struct lock_class *class; - unsigned long nr_unused = 0, nr_uncategorized = 0, - nr_irq_safe = 0, nr_irq_unsafe = 0, - nr_softirq_safe = 0, nr_softirq_unsafe = 0, - nr_hardirq_safe = 0, nr_hardirq_unsafe = 0, - nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, - nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, - nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, - sum_forward_deps = 0; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - - if (class->usage_mask == 0) - nr_unused++; - if (class->usage_mask == LOCKF_USED) - nr_uncategorized++; - if (class->usage_mask & LOCKF_USED_IN_IRQ) - nr_irq_safe++; - if (class->usage_mask & LOCKF_ENABLED_IRQ) - nr_irq_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) - nr_softirq_safe++; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ) - nr_softirq_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) - nr_hardirq_safe++; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQ) - nr_hardirq_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) - nr_irq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_IRQ_READ) - nr_irq_read_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) - nr_softirq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ) - nr_softirq_read_unsafe++; - if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) - nr_hardirq_read_safe++; - if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) - nr_hardirq_read_unsafe++; - -#ifdef CONFIG_PROVE_LOCKING - sum_forward_deps += lockdep_count_forward_deps(class); -#endif - } -#ifdef CONFIG_DEBUG_LOCKDEP - DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); -#endif - seq_printf(m, " lock-classes: %11lu [max: %lu]\n", - nr_lock_classes, MAX_LOCKDEP_KEYS); - seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", - nr_list_entries, MAX_LOCKDEP_ENTRIES); - seq_printf(m, " indirect dependencies: %11lu\n", - sum_forward_deps); - - /* - * Total number of dependencies: - * - * All irq-safe locks may nest inside irq-unsafe locks, - * plus all the other known dependencies: - */ - seq_printf(m, " all direct dependencies: %11lu\n", - nr_irq_unsafe * nr_irq_safe + - nr_hardirq_unsafe * nr_hardirq_safe + - nr_list_entries); - -#ifdef CONFIG_PROVE_LOCKING - seq_printf(m, " dependency chains: %11lu [max: %lu]\n", - nr_lock_chains, MAX_LOCKDEP_CHAINS); - seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", - nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS - seq_printf(m, " in-hardirq chains: %11u\n", - nr_hardirq_chains); - seq_printf(m, " in-softirq chains: %11u\n", - nr_softirq_chains); -#endif - seq_printf(m, " in-process chains: %11u\n", - nr_process_chains); - seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", - nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); - seq_printf(m, " combined max dependencies: %11u\n", - (nr_hardirq_chains + 1) * - (nr_softirq_chains + 1) * - (nr_process_chains + 1) - ); - seq_printf(m, " hardirq-safe locks: %11lu\n", - nr_hardirq_safe); - seq_printf(m, " hardirq-unsafe locks: %11lu\n", - nr_hardirq_unsafe); - seq_printf(m, " softirq-safe locks: %11lu\n", - nr_softirq_safe); - seq_printf(m, " softirq-unsafe locks: %11lu\n", - nr_softirq_unsafe); - seq_printf(m, " irq-safe locks: %11lu\n", - nr_irq_safe); - seq_printf(m, " irq-unsafe locks: %11lu\n", - nr_irq_unsafe); - - seq_printf(m, " hardirq-read-safe locks: %11lu\n", - nr_hardirq_read_safe); - seq_printf(m, " hardirq-read-unsafe locks: %11lu\n", - nr_hardirq_read_unsafe); - seq_printf(m, " softirq-read-safe locks: %11lu\n", - nr_softirq_read_safe); - seq_printf(m, " softirq-read-unsafe locks: %11lu\n", - nr_softirq_read_unsafe); - seq_printf(m, " irq-read-safe locks: %11lu\n", - nr_irq_read_safe); - seq_printf(m, " irq-read-unsafe locks: %11lu\n", - nr_irq_read_unsafe); - - seq_printf(m, " uncategorized locks: %11lu\n", - nr_uncategorized); - seq_printf(m, " unused locks: %11lu\n", - nr_unused); - seq_printf(m, " max locking depth: %11u\n", - max_lockdep_depth); -#ifdef CONFIG_PROVE_LOCKING - seq_printf(m, " max bfs queue depth: %11u\n", - max_bfs_queue_depth); -#endif - lockdep_stats_debug_show(m); - seq_printf(m, " debug_locks: %11u\n", - debug_locks); - - return 0; -} - -static int lockdep_stats_open(struct inode *inode, struct file *file) -{ - return single_open(file, lockdep_stats_show, NULL); -} - -static const struct file_operations proc_lockdep_stats_operations = { - .open = lockdep_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -#ifdef CONFIG_LOCK_STAT - -struct lock_stat_data { - struct lock_class *class; - struct lock_class_stats stats; -}; - -struct lock_stat_seq { - struct lock_stat_data *iter_end; - struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; -}; - -/* - * sort on absolute number of contentions - */ -static int lock_stat_cmp(const void *l, const void *r) -{ - const struct lock_stat_data *dl = l, *dr = r; - unsigned long nl, nr; - - nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr; - nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr; - - return nr - nl; -} - -static void seq_line(struct seq_file *m, char c, int offset, int length) -{ - int i; - - for (i = 0; i < offset; i++) - seq_puts(m, " "); - for (i = 0; i < length; i++) - seq_printf(m, "%c", c); - seq_puts(m, "\n"); -} - -static void snprint_time(char *buf, size_t bufsiz, s64 nr) -{ - s64 div; - s32 rem; - - nr += 5; /* for display rounding */ - div = div_s64_rem(nr, 1000, &rem); - snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10); -} - -static void seq_time(struct seq_file *m, s64 time) -{ - char num[15]; - - snprint_time(num, sizeof(num), time); - seq_printf(m, " %14s", num); -} - -static void seq_lock_time(struct seq_file *m, struct lock_time *lt) -{ - seq_printf(m, "%14lu", lt->nr); - seq_time(m, lt->min); - seq_time(m, lt->max); - seq_time(m, lt->total); - seq_time(m, lt->nr ? do_div(lt->total, lt->nr) : 0); -} - -static void seq_stats(struct seq_file *m, struct lock_stat_data *data) -{ - char name[39]; - struct lock_class *class; - struct lock_class_stats *stats; - int i, namelen; - - class = data->class; - stats = &data->stats; - - namelen = 38; - if (class->name_version > 1) - namelen -= 2; /* XXX truncates versions > 9 */ - if (class->subclass) - namelen -= 2; - - if (!class->name) { - char str[KSYM_NAME_LEN]; - const char *key_name; - - key_name = __get_key_name(class->key, str); - snprintf(name, namelen, "%s", key_name); - } else { - snprintf(name, namelen, "%s", class->name); - } - namelen = strlen(name); - if (class->name_version > 1) { - snprintf(name+namelen, 3, "#%d", class->name_version); - namelen += 2; - } - if (class->subclass) { - snprintf(name+namelen, 3, "/%d", class->subclass); - namelen += 2; - } - - if (stats->write_holdtime.nr) { - if (stats->read_holdtime.nr) - seq_printf(m, "%38s-W:", name); - else - seq_printf(m, "%40s:", name); - - seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]); - seq_lock_time(m, &stats->write_waittime); - seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]); - seq_lock_time(m, &stats->write_holdtime); - seq_puts(m, "\n"); - } - - if (stats->read_holdtime.nr) { - seq_printf(m, "%38s-R:", name); - seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]); - seq_lock_time(m, &stats->read_waittime); - seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]); - seq_lock_time(m, &stats->read_holdtime); - seq_puts(m, "\n"); - } - - if (stats->read_waittime.nr + stats->write_waittime.nr == 0) - return; - - if (stats->read_holdtime.nr) - namelen += 2; - - for (i = 0; i < LOCKSTAT_POINTS; i++) { - char ip[32]; - - if (class->contention_point[i] == 0) - break; - - if (!i) - seq_line(m, '-', 40-namelen, namelen); - - snprintf(ip, sizeof(ip), "[<%p>]", - (void *)class->contention_point[i]); - seq_printf(m, "%40s %14lu %29s %pS\n", - name, stats->contention_point[i], - ip, (void *)class->contention_point[i]); - } - for (i = 0; i < LOCKSTAT_POINTS; i++) { - char ip[32]; - - if (class->contending_point[i] == 0) - break; - - if (!i) - seq_line(m, '-', 40-namelen, namelen); - - snprintf(ip, sizeof(ip), "[<%p>]", - (void *)class->contending_point[i]); - seq_printf(m, "%40s %14lu %29s %pS\n", - name, stats->contending_point[i], - ip, (void *)class->contending_point[i]); - } - if (i) { - seq_puts(m, "\n"); - seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1)); - seq_puts(m, "\n"); - } -} - -static void seq_header(struct seq_file *m) -{ - seq_puts(m, "lock_stat version 0.4\n"); - - if (unlikely(!debug_locks)) - seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); - - seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); - seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s " - "%14s %14s\n", - "class name", - "con-bounces", - "contentions", - "waittime-min", - "waittime-max", - "waittime-total", - "waittime-avg", - "acq-bounces", - "acquisitions", - "holdtime-min", - "holdtime-max", - "holdtime-total", - "holdtime-avg"); - seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); - seq_printf(m, "\n"); -} - -static void *ls_start(struct seq_file *m, loff_t *pos) -{ - struct lock_stat_seq *data = m->private; - struct lock_stat_data *iter; - - if (*pos == 0) - return SEQ_START_TOKEN; - - iter = data->stats + (*pos - 1); - if (iter >= data->iter_end) - iter = NULL; - - return iter; -} - -static void *ls_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return ls_start(m, pos); -} - -static void ls_stop(struct seq_file *m, void *v) -{ -} - -static int ls_show(struct seq_file *m, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_header(m); - else - seq_stats(m, v); - - return 0; -} - -static const struct seq_operations lockstat_ops = { - .start = ls_start, - .next = ls_next, - .stop = ls_stop, - .show = ls_show, -}; - -static int lock_stat_open(struct inode *inode, struct file *file) -{ - int res; - struct lock_class *class; - struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); - - if (!data) - return -ENOMEM; - - res = seq_open(file, &lockstat_ops); - if (!res) { - struct lock_stat_data *iter = data->stats; - struct seq_file *m = file->private_data; - - list_for_each_entry(class, &all_lock_classes, lock_entry) { - iter->class = class; - iter->stats = lock_stats(class); - iter++; - } - data->iter_end = iter; - - sort(data->stats, data->iter_end - data->stats, - sizeof(struct lock_stat_data), - lock_stat_cmp, NULL); - - m->private = data; - } else - vfree(data); - - return res; -} - -static ssize_t lock_stat_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct lock_class *class; - char c; - - if (count) { - if (get_user(c, buf)) - return -EFAULT; - - if (c != '0') - return count; - - list_for_each_entry(class, &all_lock_classes, lock_entry) - clear_lock_stats(class); - } - return count; -} - -static int lock_stat_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - - vfree(seq->private); - return seq_release(inode, file); -} - -static const struct file_operations proc_lock_stat_operations = { - .open = lock_stat_open, - .write = lock_stat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = lock_stat_release, -}; -#endif /* CONFIG_LOCK_STAT */ - -static int __init lockdep_proc_init(void) -{ - proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); -#ifdef CONFIG_PROVE_LOCKING - proc_create("lockdep_chains", S_IRUSR, NULL, - &proc_lockdep_chains_operations); -#endif - proc_create("lockdep_stats", S_IRUSR, NULL, - &proc_lockdep_stats_operations); - -#ifdef CONFIG_LOCK_STAT - proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, - &proc_lock_stat_operations); -#endif - - return 0; -} - -__initcall(lockdep_proc_init); - diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h deleted file mode 100644 index 995b0cc2b84c..000000000000 --- a/kernel/lockdep_states.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Lockdep states, - * - * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever - * you add one, or come up with a nice dynamic solution. - */ -LOCKDEP_STATE(HARDIRQ) -LOCKDEP_STATE(SOFTIRQ) -LOCKDEP_STATE(RECLAIM_FS) diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index fe8bd58b22f8..c103599fc1ba 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -2,8 +2,14 @@ obj-y += mutex.o ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg endif obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +obj-$(CONFIG_LOCKDEP) += lockdep.o +ifeq ($(CONFIG_PROC_FS),y) +obj-$(CONFIG_LOCKDEP) += lockdep_proc.o +endif diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c new file mode 100644 index 000000000000..4e8e14c34e42 --- /dev/null +++ b/kernel/locking/lockdep.c @@ -0,0 +1,4257 @@ +/* + * kernel/lockdep.c + * + * Runtime locking correctness validator + * + * Started by Ingo Molnar: + * + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * this code maps all the lock dependencies as they occur in a live kernel + * and will warn about the following classes of locking bugs: + * + * - lock inversion scenarios + * - circular lock dependencies + * - hardirq/softirq safe/unsafe locking bugs + * + * Bugs are reported even if the current locking scenario does not cause + * any deadlock at this point. + * + * I.e. if anytime in the past two locks were taken in a different order, + * even if it happened for another task, even if those were different + * locks (but of the same class as this lock), this code will detect it. + * + * Thanks to Arjan van de Ven for coming up with the initial idea of + * mapping lock dependencies runtime. + */ +#define DISABLE_BRANCH_PROFILING +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "lockdep_internals.h" + +#define CREATE_TRACE_POINTS +#include + +#ifdef CONFIG_PROVE_LOCKING +int prove_locking = 1; +module_param(prove_locking, int, 0644); +#else +#define prove_locking 0 +#endif + +#ifdef CONFIG_LOCK_STAT +int lock_stat = 1; +module_param(lock_stat, int, 0644); +#else +#define lock_stat 0 +#endif + +/* + * lockdep_lock: protects the lockdep graph, the hashes and the + * class/list/hash allocators. + * + * This is one of the rare exceptions where it's justified + * to use a raw spinlock - we really dont want the spinlock + * code to recurse back into the lockdep code... + */ +static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + +static int graph_lock(void) +{ + arch_spin_lock(&lockdep_lock); + /* + * Make sure that if another CPU detected a bug while + * walking the graph we dont change it (while the other + * CPU is busy printing out stuff with the graph lock + * dropped already) + */ + if (!debug_locks) { + arch_spin_unlock(&lockdep_lock); + return 0; + } + /* prevent any recursions within lockdep from causing deadlocks */ + current->lockdep_recursion++; + return 1; +} + +static inline int graph_unlock(void) +{ + if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { + /* + * The lockdep graph lock isn't locked while we expect it to + * be, we're confused now, bye! + */ + return DEBUG_LOCKS_WARN_ON(1); + } + + current->lockdep_recursion--; + arch_spin_unlock(&lockdep_lock); + return 0; +} + +/* + * Turn lock debugging off and return with 0 if it was off already, + * and also release the graph lock: + */ +static inline int debug_locks_off_graph_unlock(void) +{ + int ret = debug_locks_off(); + + arch_spin_unlock(&lockdep_lock); + + return ret; +} + +static int lockdep_initialized; + +unsigned long nr_list_entries; +static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; + +/* + * All data structures here are protected by the global debug_lock. + * + * Mutex key structs only get allocated, once during bootup, and never + * get freed - this significantly simplifies the debugging code. + */ +unsigned long nr_lock_classes; +static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; + +static inline struct lock_class *hlock_class(struct held_lock *hlock) +{ + if (!hlock->class_idx) { + /* + * Someone passed in garbage, we give up. + */ + DEBUG_LOCKS_WARN_ON(1); + return NULL; + } + return lock_classes + hlock->class_idx - 1; +} + +#ifdef CONFIG_LOCK_STAT +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], + cpu_lock_stats); + +static inline u64 lockstat_clock(void) +{ + return local_clock(); +} + +static int lock_point(unsigned long points[], unsigned long ip) +{ + int i; + + for (i = 0; i < LOCKSTAT_POINTS; i++) { + if (points[i] == 0) { + points[i] = ip; + break; + } + if (points[i] == ip) + break; + } + + return i; +} + +static void lock_time_inc(struct lock_time *lt, u64 time) +{ + if (time > lt->max) + lt->max = time; + + if (time < lt->min || !lt->nr) + lt->min = time; + + lt->total += time; + lt->nr++; +} + +static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) +{ + if (!src->nr) + return; + + if (src->max > dst->max) + dst->max = src->max; + + if (src->min < dst->min || !dst->nr) + dst->min = src->min; + + dst->total += src->total; + dst->nr += src->nr; +} + +struct lock_class_stats lock_stats(struct lock_class *class) +{ + struct lock_class_stats stats; + int cpu, i; + + memset(&stats, 0, sizeof(struct lock_class_stats)); + for_each_possible_cpu(cpu) { + struct lock_class_stats *pcs = + &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; + + for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) + stats.contention_point[i] += pcs->contention_point[i]; + + for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) + stats.contending_point[i] += pcs->contending_point[i]; + + lock_time_add(&pcs->read_waittime, &stats.read_waittime); + lock_time_add(&pcs->write_waittime, &stats.write_waittime); + + lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); + lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + + for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) + stats.bounces[i] += pcs->bounces[i]; + } + + return stats; +} + +void clear_lock_stats(struct lock_class *class) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct lock_class_stats *cpu_stats = + &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; + + memset(cpu_stats, 0, sizeof(struct lock_class_stats)); + } + memset(class->contention_point, 0, sizeof(class->contention_point)); + memset(class->contending_point, 0, sizeof(class->contending_point)); +} + +static struct lock_class_stats *get_lock_stats(struct lock_class *class) +{ + return &get_cpu_var(cpu_lock_stats)[class - lock_classes]; +} + +static void put_lock_stats(struct lock_class_stats *stats) +{ + put_cpu_var(cpu_lock_stats); +} + +static void lock_release_holdtime(struct held_lock *hlock) +{ + struct lock_class_stats *stats; + u64 holdtime; + + if (!lock_stat) + return; + + holdtime = lockstat_clock() - hlock->holdtime_stamp; + + stats = get_lock_stats(hlock_class(hlock)); + if (hlock->read) + lock_time_inc(&stats->read_holdtime, holdtime); + else + lock_time_inc(&stats->write_holdtime, holdtime); + put_lock_stats(stats); +} +#else +static inline void lock_release_holdtime(struct held_lock *hlock) +{ +} +#endif + +/* + * We keep a global list of all lock classes. The list only grows, + * never shrinks. The list is only accessed with the lockdep + * spinlock lock held. + */ +LIST_HEAD(all_lock_classes); + +/* + * The lockdep classes are in a hash-table as well, for fast lookup: + */ +#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) +#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) +#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) +#define classhashentry(key) (classhash_table + __classhashfn((key))) + +static struct list_head classhash_table[CLASSHASH_SIZE]; + +/* + * We put the lock dependency chains into a hash-table as well, to cache + * their existence: + */ +#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) +#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) +#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) +#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) + +static struct list_head chainhash_table[CHAINHASH_SIZE]; + +/* + * The hash key of the lock dependency chains is a hash itself too: + * it's a hash of all locks taken up to that lock, including that lock. + * It's a 64-bit hash, because it's important for the keys to be + * unique. + */ +#define iterate_chain_key(key1, key2) \ + (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ + ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ + (key2)) + +void lockdep_off(void) +{ + current->lockdep_recursion++; +} +EXPORT_SYMBOL(lockdep_off); + +void lockdep_on(void) +{ + current->lockdep_recursion--; +} +EXPORT_SYMBOL(lockdep_on); + +/* + * Debugging switches: + */ + +#define VERBOSE 0 +#define VERY_VERBOSE 0 + +#if VERBOSE +# define HARDIRQ_VERBOSE 1 +# define SOFTIRQ_VERBOSE 1 +# define RECLAIM_VERBOSE 1 +#else +# define HARDIRQ_VERBOSE 0 +# define SOFTIRQ_VERBOSE 0 +# define RECLAIM_VERBOSE 0 +#endif + +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE +/* + * Quick filtering for interesting events: + */ +static int class_filter(struct lock_class *class) +{ +#if 0 + /* Example */ + if (class->name_version == 1 && + !strcmp(class->name, "lockname")) + return 1; + if (class->name_version == 1 && + !strcmp(class->name, "&struct->lockfield")) + return 1; +#endif + /* Filter everything else. 1 would be to allow everything else */ + return 0; +} +#endif + +static int verbose(struct lock_class *class) +{ +#if VERBOSE + return class_filter(class); +#endif + return 0; +} + +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the graph_lock. + */ +unsigned long nr_stack_trace_entries; +static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; + +static void print_lockdep_off(const char *bug_msg) +{ + printk(KERN_DEBUG "%s\n", bug_msg); + printk(KERN_DEBUG "turning off the locking correctness validator.\n"); + printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); +} + +static int save_trace(struct stack_trace *trace) +{ + trace->nr_entries = 0; + trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + trace->entries = stack_trace + nr_stack_trace_entries; + + trace->skip = 3; + + save_stack_trace(trace); + + /* + * Some daft arches put -1 at the end to indicate its a full trace. + * + * this is buggy anyway, since it takes a whole extra entry so a + * complete trace that maxes out the entries provided will be reported + * as incomplete, friggin useless + */ + if (trace->nr_entries != 0 && + trace->entries[trace->nr_entries-1] == ULONG_MAX) + trace->nr_entries--; + + trace->max_entries = trace->nr_entries; + + nr_stack_trace_entries += trace->nr_entries; + + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); + dump_stack(); + + return 0; + } + + return 1; +} + +unsigned int nr_hardirq_chains; +unsigned int nr_softirq_chains; +unsigned int nr_process_chains; +unsigned int max_lockdep_depth; + +#ifdef CONFIG_DEBUG_LOCKDEP +/* + * We cannot printk in early bootup code. Not even early_printk() + * might work. So we mark any initialization errors and printk + * about it later on, in lockdep_info(). + */ +static int lockdep_init_error; +static const char *lock_init_error; +static unsigned long lockdep_init_trace_data[20]; +static struct stack_trace lockdep_init_trace = { + .max_entries = ARRAY_SIZE(lockdep_init_trace_data), + .entries = lockdep_init_trace_data, +}; + +/* + * Various lockdep statistics: + */ +DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); +#endif + +/* + * Locking printouts: + */ + +#define __USAGE(__STATE) \ + [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \ + [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \ + [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\ + [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R", + +static const char *usage_str[] = +{ +#define LOCKDEP_STATE(__STATE) __USAGE(__STATE) +#include "lockdep_states.h" +#undef LOCKDEP_STATE + [LOCK_USED] = "INITIAL USE", +}; + +const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +{ + return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); +} + +static inline unsigned long lock_flag(enum lock_usage_bit bit) +{ + return 1UL << bit; +} + +static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) +{ + char c = '.'; + + if (class->usage_mask & lock_flag(bit + 2)) + c = '+'; + if (class->usage_mask & lock_flag(bit)) { + c = '-'; + if (class->usage_mask & lock_flag(bit + 2)) + c = '?'; + } + + return c; +} + +void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) +{ + int i = 0; + +#define LOCKDEP_STATE(__STATE) \ + usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \ + usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ); +#include "lockdep_states.h" +#undef LOCKDEP_STATE + + usage[i] = '\0'; +} + +static void __print_lock_name(struct lock_class *class) +{ + char str[KSYM_NAME_LEN]; + const char *name; + + name = class->name; + if (!name) { + name = __get_key_name(class->key, str); + printk("%s", name); + } else { + printk("%s", name); + if (class->name_version > 1) + printk("#%d", class->name_version); + if (class->subclass) + printk("/%d", class->subclass); + } +} + +static void print_lock_name(struct lock_class *class) +{ + char usage[LOCK_USAGE_CHARS]; + + get_usage_chars(class, usage); + + printk(" ("); + __print_lock_name(class); + printk("){%s}", usage); +} + +static void print_lockdep_cache(struct lockdep_map *lock) +{ + const char *name; + char str[KSYM_NAME_LEN]; + + name = lock->name; + if (!name) + name = __get_key_name(lock->key->subkeys, str); + + printk("%s", name); +} + +static void print_lock(struct held_lock *hlock) +{ + print_lock_name(hlock_class(hlock)); + printk(", at: "); + print_ip_sym(hlock->acquire_ip); +} + +static void lockdep_print_held_locks(struct task_struct *curr) +{ + int i, depth = curr->lockdep_depth; + + if (!depth) { + printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); + return; + } + printk("%d lock%s held by %s/%d:\n", + depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr)); + + for (i = 0; i < depth; i++) { + printk(" #%d: ", i); + print_lock(curr->held_locks + i); + } +} + +static void print_kernel_ident(void) +{ + printk("%s %.*s %s\n", init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, + print_tainted()); +} + +static int very_verbose(struct lock_class *class) +{ +#if VERY_VERBOSE + return class_filter(class); +#endif + return 0; +} + +/* + * Is this the address of a static object: + */ +static int static_obj(void *obj) +{ + unsigned long start = (unsigned long) &_stext, + end = (unsigned long) &_end, + addr = (unsigned long) obj; + + /* + * static variable? + */ + if ((addr >= start) && (addr < end)) + return 1; + + if (arch_is_kernel_data(addr)) + return 1; + + /* + * in-kernel percpu var? + */ + if (is_kernel_percpu_address(addr)) + return 1; + + /* + * module static or percpu var? + */ + return is_module_address(addr) || is_module_percpu_address(addr); +} + +/* + * To make lock name printouts unique, we calculate a unique + * class->name_version generation counter: + */ +static int count_matching_names(struct lock_class *new_class) +{ + struct lock_class *class; + int count = 0; + + if (!new_class->name) + return 0; + + list_for_each_entry(class, &all_lock_classes, lock_entry) { + if (new_class->key - new_class->subclass == class->key) + return class->name_version; + if (class->name && !strcmp(class->name, new_class->name)) + count = max(count, class->name_version); + } + + return count + 1; +} + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) +{ + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * If the architecture calls into lockdep before initializing + * the hashes then we'll warn about it later. (we cannot printk + * right now) + */ + if (unlikely(!lockdep_initialized)) { + lockdep_init(); + lockdep_init_error = 1; + lock_init_error = lock->name; + save_stack_trace(&lockdep_init_trace); + } +#endif + + if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + debug_locks_off(); + printk(KERN_ERR + "BUG: looking up invalid subclass: %u\n", subclass); + printk(KERN_ERR + "turning off the locking correctness validator.\n"); + dump_stack(); + return NULL; + } + + /* + * Static locks do not have their class-keys yet - for them the key + * is the lock object itself: + */ + if (unlikely(!lock->key)) + lock->key = (void *)lock; + + /* + * NOTE: the class-key must be unique. For dynamic locks, a static + * lock_class_key variable is passed in through the mutex_init() + * (or spin_lock_init()) call - which acts as the key. For static + * locks we use the lock object itself as the key. + */ + BUILD_BUG_ON(sizeof(struct lock_class_key) > + sizeof(struct lockdep_map)); + + key = lock->key->subkeys + subclass; + + hash_head = classhashentry(key); + + /* + * We can walk the hash lockfree, because the hash only + * grows, and we are careful when adding entries to the end: + */ + list_for_each_entry(class, hash_head, hash_entry) { + if (class->key == key) { + /* + * Huh! same key, different name? Did someone trample + * on some memory? We're most confused. + */ + WARN_ON_ONCE(class->name != lock->name); + return class; + } + } + + return NULL; +} + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) +{ + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + unsigned long flags; + + class = look_up_lock_class(lock, subclass); + if (likely(class)) + goto out_set_class_cache; + + /* + * Debug-check: all keys must be persistent! + */ + if (!static_obj(lock->key)) { + debug_locks_off(); + printk("INFO: trying to register non-static key.\n"); + printk("the code is fine but needs lockdep annotation.\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + + return NULL; + } + + key = lock->key->subkeys + subclass; + hash_head = classhashentry(key); + + raw_local_irq_save(flags); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } + /* + * We have to do the hash-walk again, to avoid races + * with another CPU: + */ + list_for_each_entry(class, hash_head, hash_entry) + if (class->key == key) + goto out_unlock_set; + /* + * Allocate a new key from the static array, and add it to + * the hash: + */ + if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + if (!debug_locks_off_graph_unlock()) { + raw_local_irq_restore(flags); + return NULL; + } + raw_local_irq_restore(flags); + + print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); + dump_stack(); + return NULL; + } + class = lock_classes + nr_lock_classes++; + debug_atomic_inc(nr_unused_locks); + class->key = key; + class->name = lock->name; + class->subclass = subclass; + INIT_LIST_HEAD(&class->lock_entry); + INIT_LIST_HEAD(&class->locks_before); + INIT_LIST_HEAD(&class->locks_after); + class->name_version = count_matching_names(class); + /* + * We use RCU's safe list-add method to make + * parallel walking of the hash-list safe: + */ + list_add_tail_rcu(&class->hash_entry, hash_head); + /* + * Add it to the global list of classes: + */ + list_add_tail_rcu(&class->lock_entry, &all_lock_classes); + + if (verbose(class)) { + graph_unlock(); + raw_local_irq_restore(flags); + + printk("\nnew class %p: %s", class->key, class->name); + if (class->name_version > 1) + printk("#%d", class->name_version); + printk("\n"); + dump_stack(); + + raw_local_irq_save(flags); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } + } +out_unlock_set: + graph_unlock(); + raw_local_irq_restore(flags); + +out_set_class_cache: + if (!subclass || force) + lock->class_cache[0] = class; + else if (subclass < NR_LOCKDEP_CACHING_CLASSES) + lock->class_cache[subclass] = class; + + /* + * Hash collision, did we smoke some? We found a class with a matching + * hash but the subclass -- which is hashed in -- didn't match. + */ + if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) + return NULL; + + return class; +} + +#ifdef CONFIG_PROVE_LOCKING +/* + * Allocate a lockdep entry. (assumes the graph_lock held, returns + * with NULL on failure) + */ +static struct lock_list *alloc_list_entry(void) +{ + if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + if (!debug_locks_off_graph_unlock()) + return NULL; + + print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); + dump_stack(); + return NULL; + } + return list_entries + nr_list_entries++; +} + +/* + * Add a new dependency to the head of the list: + */ +static int add_lock_to_list(struct lock_class *class, struct lock_class *this, + struct list_head *head, unsigned long ip, + int distance, struct stack_trace *trace) +{ + struct lock_list *entry; + /* + * Lock not present yet - get a new dependency struct and + * add it to the list: + */ + entry = alloc_list_entry(); + if (!entry) + return 0; + + entry->class = this; + entry->distance = distance; + entry->trace = *trace; + /* + * Since we never remove from the dependency list, the list can + * be walked lockless by other CPUs, it's only allocation + * that must be protected by the spinlock. But this also means + * we must make new entries visible only once writes to the + * entry become visible - hence the RCU op: + */ + list_add_tail_rcu(&entry->entry, head); + + return 1; +} + +/* + * For good efficiency of modular, we use power of 2 + */ +#define MAX_CIRCULAR_QUEUE_SIZE 4096UL +#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) + +/* + * The circular_queue and helpers is used to implement the + * breadth-first search(BFS)algorithem, by which we can build + * the shortest path from the next lock to be acquired to the + * previous held lock if there is a circular between them. + */ +struct circular_queue { + unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; + unsigned int front, rear; +}; + +static struct circular_queue lock_cq; + +unsigned int max_bfs_queue_depth; + +static unsigned int lockdep_dependency_gen_id; + +static inline void __cq_init(struct circular_queue *cq) +{ + cq->front = cq->rear = 0; + lockdep_dependency_gen_id++; +} + +static inline int __cq_empty(struct circular_queue *cq) +{ + return (cq->front == cq->rear); +} + +static inline int __cq_full(struct circular_queue *cq) +{ + return ((cq->rear + 1) & CQ_MASK) == cq->front; +} + +static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +{ + if (__cq_full(cq)) + return -1; + + cq->element[cq->rear] = elem; + cq->rear = (cq->rear + 1) & CQ_MASK; + return 0; +} + +static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +{ + if (__cq_empty(cq)) + return -1; + + *elem = cq->element[cq->front]; + cq->front = (cq->front + 1) & CQ_MASK; + return 0; +} + +static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) +{ + return (cq->rear - cq->front) & CQ_MASK; +} + +static inline void mark_lock_accessed(struct lock_list *lock, + struct lock_list *parent) +{ + unsigned long nr; + + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ + lock->parent = parent; + lock->class->dep_gen_id = lockdep_dependency_gen_id; +} + +static inline unsigned long lock_accessed(struct lock_list *lock) +{ + unsigned long nr; + + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ + return lock->class->dep_gen_id == lockdep_dependency_gen_id; +} + +static inline struct lock_list *get_lock_parent(struct lock_list *child) +{ + return child->parent; +} + +static inline int get_lock_depth(struct lock_list *child) +{ + int depth = 0; + struct lock_list *parent; + + while ((parent = get_lock_parent(child))) { + child = parent; + depth++; + } + return depth; +} + +static int __bfs(struct lock_list *source_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int forward) +{ + struct lock_list *entry; + struct list_head *head; + struct circular_queue *cq = &lock_cq; + int ret = 1; + + if (match(source_entry, data)) { + *target_entry = source_entry; + ret = 0; + goto exit; + } + + if (forward) + head = &source_entry->class->locks_after; + else + head = &source_entry->class->locks_before; + + if (list_empty(head)) + goto exit; + + __cq_init(cq); + __cq_enqueue(cq, (unsigned long)source_entry); + + while (!__cq_empty(cq)) { + struct lock_list *lock; + + __cq_dequeue(cq, (unsigned long *)&lock); + + if (!lock->class) { + ret = -2; + goto exit; + } + + if (forward) + head = &lock->class->locks_after; + else + head = &lock->class->locks_before; + + list_for_each_entry(entry, head, entry) { + if (!lock_accessed(entry)) { + unsigned int cq_depth; + mark_lock_accessed(entry, lock); + if (match(entry, data)) { + *target_entry = entry; + ret = 0; + goto exit; + } + + if (__cq_enqueue(cq, (unsigned long)entry)) { + ret = -1; + goto exit; + } + cq_depth = __cq_get_elem_count(cq); + if (max_bfs_queue_depth < cq_depth) + max_bfs_queue_depth = cq_depth; + } + } + } +exit: + return ret; +} + +static inline int __bfs_forwards(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) +{ + return __bfs(src_entry, data, match, target_entry, 1); + +} + +static inline int __bfs_backwards(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) +{ + return __bfs(src_entry, data, match, target_entry, 0); + +} + +/* + * Recursive, forwards-direction lock-dependency checking, used for + * both noncyclic checking and for hardirq-unsafe/softirq-unsafe + * checking. + */ + +/* + * Print a dependency chain entry (this is only done when a deadlock + * has been detected): + */ +static noinline int +print_circular_bug_entry(struct lock_list *target, int depth) +{ + if (debug_locks_silent) + return 0; + printk("\n-> #%u", depth); + print_lock_name(target->class); + printk(":\n"); + print_stack_trace(&target->trace, 6); + + return 0; +} + +static void +print_circular_lock_scenario(struct held_lock *src, + struct held_lock *tgt, + struct lock_list *prt) +{ + struct lock_class *source = hlock_class(src); + struct lock_class *target = hlock_class(tgt); + struct lock_class *parent = prt->class; + + /* + * A direct locking problem where unsafe_class lock is taken + * directly by safe_class lock, then all we need to show + * is the deadlock scenario, as it is obvious that the + * unsafe lock is taken under the safe lock. + * + * But if there is a chain instead, where the safe lock takes + * an intermediate lock (middle_class) where this lock is + * not the same as the safe lock, then the lock chain is + * used to describe the problem. Otherwise we would need + * to show a different CPU case for each link in the chain + * from the safe_class lock to the unsafe_class lock. + */ + if (parent != source) { + printk("Chain exists of:\n "); + __print_lock_name(source); + printk(" --> "); + __print_lock_name(parent); + printk(" --> "); + __print_lock_name(target); + printk("\n\n"); + } + + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(");\n"); + printk(" lock("); + __print_lock_name(target); + printk(");\n"); + printk(" lock("); + __print_lock_name(source); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); +} + +/* + * When a circular dependency is detected, print the + * header first: + */ +static noinline int +print_circular_bug_header(struct lock_list *entry, unsigned int depth, + struct held_lock *check_src, + struct held_lock *check_tgt) +{ + struct task_struct *curr = current; + + if (debug_locks_silent) + return 0; + + printk("\n"); + printk("======================================================\n"); + printk("[ INFO: possible circular locking dependency detected ]\n"); + print_kernel_ident(); + printk("-------------------------------------------------------\n"); + printk("%s/%d is trying to acquire lock:\n", + curr->comm, task_pid_nr(curr)); + print_lock(check_src); + printk("\nbut task is already holding lock:\n"); + print_lock(check_tgt); + printk("\nwhich lock already depends on the new lock.\n\n"); + printk("\nthe existing dependency chain (in reverse order) is:\n"); + + print_circular_bug_entry(entry, depth); + + return 0; +} + +static inline int class_equal(struct lock_list *entry, void *data) +{ + return entry->class == data; +} + +static noinline int print_circular_bug(struct lock_list *this, + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) +{ + struct task_struct *curr = current; + struct lock_list *parent; + struct lock_list *first_parent; + int depth; + + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + if (!save_trace(&this->trace)) + return 0; + + depth = get_lock_depth(target); + + print_circular_bug_header(target, depth, check_src, check_tgt); + + parent = get_lock_parent(target); + first_parent = parent; + + while (parent) { + print_circular_bug_entry(parent, --depth); + parent = get_lock_parent(parent); + } + + printk("\nother info that might help us debug this:\n\n"); + print_circular_lock_scenario(check_src, check_tgt, + first_parent); + + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static noinline int print_bfs_bug(int ret) +{ + if (!debug_locks_off_graph_unlock()) + return 0; + + /* + * Breadth-first-search failed, graph got corrupted? + */ + WARN(1, "lockdep bfs error:%d\n", ret); + + return 0; +} + +static int noop_count(struct lock_list *entry, void *data) +{ + (*(unsigned long *)data)++; + return 0; +} + +unsigned long __lockdep_count_forward_deps(struct lock_list *this) +{ + unsigned long count = 0; + struct lock_list *uninitialized_var(target_entry); + + __bfs_forwards(this, (void *)&count, noop_count, &target_entry); + + return count; +} +unsigned long lockdep_count_forward_deps(struct lock_class *class) +{ + unsigned long ret, flags; + struct lock_list this; + + this.parent = NULL; + this.class = class; + + local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + ret = __lockdep_count_forward_deps(&this); + arch_spin_unlock(&lockdep_lock); + local_irq_restore(flags); + + return ret; +} + +unsigned long __lockdep_count_backward_deps(struct lock_list *this) +{ + unsigned long count = 0; + struct lock_list *uninitialized_var(target_entry); + + __bfs_backwards(this, (void *)&count, noop_count, &target_entry); + + return count; +} + +unsigned long lockdep_count_backward_deps(struct lock_class *class) +{ + unsigned long ret, flags; + struct lock_list this; + + this.parent = NULL; + this.class = class; + + local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + ret = __lockdep_count_backward_deps(&this); + arch_spin_unlock(&lockdep_lock); + local_irq_restore(flags); + + return ret; +} + +/* + * Prove that the dependency graph starting at can not + * lead to . Print an error and return 0 if it does. + */ +static noinline int +check_noncircular(struct lock_list *root, struct lock_class *target, + struct lock_list **target_entry) +{ + int result; + + debug_atomic_inc(nr_cyclic_checks); + + result = __bfs_forwards(root, target, class_equal, target_entry); + + return result; +} + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +/* + * Forwards and backwards subgraph searching, for the purposes of + * proving that two subgraphs can be connected by a new dependency + * without creating any illegal irq-safe -> irq-unsafe lock dependency. + */ + +static inline int usage_match(struct lock_list *entry, void *bit) +{ + return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); +} + + + +/* + * Find a node in the forwards-direction dependency sub-graph starting + * at @root->class that matches @bit. + * + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. + * + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. + */ +static int +find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, + struct lock_list **target_entry) +{ + int result; + + debug_atomic_inc(nr_find_usage_forwards_checks); + + result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + + return result; +} + +/* + * Find a node in the backwards-direction dependency sub-graph starting + * at @root->class that matches @bit. + * + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. + * + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. + */ +static int +find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, + struct lock_list **target_entry) +{ + int result; + + debug_atomic_inc(nr_find_usage_backwards_checks); + + result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); + + return result; +} + +static void print_lock_class_header(struct lock_class *class, int depth) +{ + int bit; + + printk("%*s->", depth, ""); + print_lock_name(class); + printk(" ops: %lu", class->ops); + printk(" {\n"); + + for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + if (class->usage_mask & (1 << bit)) { + int len = depth; + + len += printk("%*s %s", depth, "", usage_str[bit]); + len += printk(" at:\n"); + print_stack_trace(class->usage_traces + bit, len); + } + } + printk("%*s }\n", depth, ""); + + printk("%*s ... key at: ",depth,""); + print_ip_sym((unsigned long)class->key); +} + +/* + * printk the shortest lock dependencies from @start to @end in reverse order: + */ +static void __used +print_shortest_lock_dependencies(struct lock_list *leaf, + struct lock_list *root) +{ + struct lock_list *entry = leaf; + int depth; + + /*compute depth from generated tree by BFS*/ + depth = get_lock_depth(leaf); + + do { + print_lock_class_header(entry->class, depth); + printk("%*s ... acquired at:\n", depth, ""); + print_stack_trace(&entry->trace, 2); + printk("\n"); + + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad path found in chain graph\n", __func__); + break; + } + + entry = get_lock_parent(entry); + depth--; + } while (entry && (depth >= 0)); + + return; +} + +static void +print_irq_lock_scenario(struct lock_list *safe_entry, + struct lock_list *unsafe_entry, + struct lock_class *prev_class, + struct lock_class *next_class) +{ + struct lock_class *safe_class = safe_entry->class; + struct lock_class *unsafe_class = unsafe_entry->class; + struct lock_class *middle_class = prev_class; + + if (middle_class == safe_class) + middle_class = next_class; + + /* + * A direct locking problem where unsafe_class lock is taken + * directly by safe_class lock, then all we need to show + * is the deadlock scenario, as it is obvious that the + * unsafe lock is taken under the safe lock. + * + * But if there is a chain instead, where the safe lock takes + * an intermediate lock (middle_class) where this lock is + * not the same as the safe lock, then the lock chain is + * used to describe the problem. Otherwise we would need + * to show a different CPU case for each link in the chain + * from the safe_class lock to the unsafe_class lock. + */ + if (middle_class != unsafe_class) { + printk("Chain exists of:\n "); + __print_lock_name(safe_class); + printk(" --> "); + __print_lock_name(middle_class); + printk(" --> "); + __print_lock_name(unsafe_class); + printk("\n\n"); + } + + printk(" Possible interrupt unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(unsafe_class); + printk(");\n"); + printk(" local_irq_disable();\n"); + printk(" lock("); + __print_lock_name(safe_class); + printk(");\n"); + printk(" lock("); + __print_lock_name(middle_class); + printk(");\n"); + printk(" \n"); + printk(" lock("); + __print_lock_name(safe_class); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); +} + +static int +print_bad_irq_dependency(struct task_struct *curr, + struct lock_list *prev_root, + struct lock_list *next_root, + struct lock_list *backwards_entry, + struct lock_list *forwards_entry, + struct held_lock *prev, + struct held_lock *next, + enum lock_usage_bit bit1, + enum lock_usage_bit bit2, + const char *irqclass) +{ + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n"); + printk("======================================================\n"); + printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + irqclass, irqclass); + print_kernel_ident(); + printk("------------------------------------------------------\n"); + printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", + curr->comm, task_pid_nr(curr), + curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, + curr->hardirqs_enabled, + curr->softirqs_enabled); + print_lock(next); + + printk("\nand this task is already holding:\n"); + print_lock(prev); + printk("which would create a new lock dependency:\n"); + print_lock_name(hlock_class(prev)); + printk(" ->"); + print_lock_name(hlock_class(next)); + printk("\n"); + + printk("\nbut this new dependency connects a %s-irq-safe lock:\n", + irqclass); + print_lock_name(backwards_entry->class); + printk("\n... which became %s-irq-safe at:\n", irqclass); + + print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); + + printk("\nto a %s-irq-unsafe lock:\n", irqclass); + print_lock_name(forwards_entry->class); + printk("\n... which became %s-irq-unsafe at:\n", irqclass); + printk("..."); + + print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); + + printk("\nother info that might help us debug this:\n\n"); + print_irq_lock_scenario(backwards_entry, forwards_entry, + hlock_class(prev), hlock_class(next)); + + lockdep_print_held_locks(curr); + + printk("\nthe dependencies between %s-irq-safe lock", irqclass); + printk(" and the holding lock:\n"); + if (!save_trace(&prev_root->trace)) + return 0; + print_shortest_lock_dependencies(backwards_entry, prev_root); + + printk("\nthe dependencies between the lock to be acquired"); + printk(" and %s-irq-unsafe lock:\n", irqclass); + if (!save_trace(&next_root->trace)) + return 0; + print_shortest_lock_dependencies(forwards_entry, next_root); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static int +check_usage(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next, enum lock_usage_bit bit_backwards, + enum lock_usage_bit bit_forwards, const char *irqclass) +{ + int ret; + struct lock_list this, that; + struct lock_list *uninitialized_var(target_entry); + struct lock_list *uninitialized_var(target_entry1); + + this.parent = NULL; + + this.class = hlock_class(prev); + ret = find_usage_backwards(&this, bit_backwards, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; + + that.parent = NULL; + that.class = hlock_class(next); + ret = find_usage_forwards(&that, bit_forwards, &target_entry1); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; + + return print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, + bit_backwards, bit_forwards, irqclass); +} + +static const char *state_names[] = { +#define LOCKDEP_STATE(__STATE) \ + __stringify(__STATE), +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static const char *state_rnames[] = { +#define LOCKDEP_STATE(__STATE) \ + __stringify(__STATE)"-READ", +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static inline const char *state_name(enum lock_usage_bit bit) +{ + return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; +} + +static int exclusive_bit(int new_bit) +{ + /* + * USED_IN + * USED_IN_READ + * ENABLED + * ENABLED_READ + * + * bit 0 - write/read + * bit 1 - used_in/enabled + * bit 2+ state + */ + + int state = new_bit & ~3; + int dir = new_bit & 2; + + /* + * keep state, bit flip the direction and strip read. + */ + return state | (dir ^ 2); +} + +static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next, enum lock_usage_bit bit) +{ + /* + * Prove that the new dependency does not connect a hardirq-safe + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, bit, + exclusive_bit(bit), state_name(bit))) + return 0; + + bit++; /* _READ */ + + /* + * Prove that the new dependency does not connect a hardirq-safe-read + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, bit, + exclusive_bit(bit), state_name(bit))) + return 0; + + return 1; +} + +static int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ +#define LOCKDEP_STATE(__STATE) \ + if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ + return 0; +#include "lockdep_states.h" +#undef LOCKDEP_STATE + + return 1; +} + +static void inc_chains(void) +{ + if (current->hardirq_context) + nr_hardirq_chains++; + else { + if (current->softirq_context) + nr_softirq_chains++; + else + nr_process_chains++; + } +} + +#else + +static inline int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + return 1; +} + +static inline void inc_chains(void) +{ + nr_process_chains++; +} + +#endif + +static void +print_deadlock_scenario(struct held_lock *nxt, + struct held_lock *prv) +{ + struct lock_class *next = hlock_class(nxt); + struct lock_class *prev = hlock_class(prv); + + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0\n"); + printk(" ----\n"); + printk(" lock("); + __print_lock_name(prev); + printk(");\n"); + printk(" lock("); + __print_lock_name(next); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); + printk(" May be due to missing lock nesting notation\n\n"); +} + +static int +print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n"); + printk("=============================================\n"); + printk("[ INFO: possible recursive locking detected ]\n"); + print_kernel_ident(); + printk("---------------------------------------------\n"); + printk("%s/%d is trying to acquire lock:\n", + curr->comm, task_pid_nr(curr)); + print_lock(next); + printk("\nbut task is already holding lock:\n"); + print_lock(prev); + + printk("\nother info that might help us debug this:\n"); + print_deadlock_scenario(next, prev); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Check whether we are holding such a class already. + * + * (Note that this has to be done separately, because the graph cannot + * detect such classes of deadlocks.) + * + * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + */ +static int +check_deadlock(struct task_struct *curr, struct held_lock *next, + struct lockdep_map *next_instance, int read) +{ + struct held_lock *prev; + struct held_lock *nest = NULL; + int i; + + for (i = 0; i < curr->lockdep_depth; i++) { + prev = curr->held_locks + i; + + if (prev->instance == next->nest_lock) + nest = prev; + + if (hlock_class(prev) != hlock_class(next)) + continue; + + /* + * Allow read-after-read recursion of the same + * lock class (i.e. read_lock(lock)+read_lock(lock)): + */ + if ((read == 2) && prev->read) + return 2; + + /* + * We're holding the nest_lock, which serializes this lock's + * nesting behaviour. + */ + if (nest) + return 2; + + return print_deadlock_bug(curr, prev, next); + } + return 1; +} + +/* + * There was a chain-cache miss, and we are about to add a new dependency + * to a previous lock. We recursively validate the following rules: + * + * - would the adding of the -> dependency create a + * circular dependency in the graph? [== circular deadlock] + * + * - does the new prev->next dependency connect any hardirq-safe lock + * (in the full backwards-subgraph starting at ) with any + * hardirq-unsafe lock (in the full forwards-subgraph starting at + * )? [== illegal lock inversion with hardirq contexts] + * + * - does the new prev->next dependency connect any softirq-safe lock + * (in the full backwards-subgraph starting at ) with any + * softirq-unsafe lock (in the full forwards-subgraph starting at + * )? [== illegal lock inversion with softirq contexts] + * + * any of these scenarios could lead to a deadlock. + * + * Then if all the validations pass, we add the forwards and backwards + * dependency. + */ +static int +check_prev_add(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next, int distance, int trylock_loop) +{ + struct lock_list *entry; + int ret; + struct lock_list this; + struct lock_list *uninitialized_var(target_entry); + /* + * Static variable, serialized by the graph_lock(). + * + * We use this static variable to save the stack trace in case + * we call into this function multiple times due to encountering + * trylocks in the held lock stack. + */ + static struct stack_trace trace; + + /* + * Prove that the new -> dependency would not + * create a circular dependency in the graph. (We do this by + * forward-recursing into the graph starting at , and + * checking whether we can reach .) + * + * We are using global variables to control the recursion, to + * keep the stackframe size of the recursive functions low: + */ + this.class = hlock_class(next); + this.parent = NULL; + ret = check_noncircular(&this, hlock_class(prev), &target_entry); + if (unlikely(!ret)) + return print_circular_bug(&this, target_entry, next, prev); + else if (unlikely(ret < 0)) + return print_bfs_bug(ret); + + if (!check_prev_add_irq(curr, prev, next)) + return 0; + + /* + * For recursive read-locks we do all the dependency checks, + * but we dont store read-triggered dependencies (only + * write-triggered dependencies). This ensures that only the + * write-side dependencies matter, and that if for example a + * write-lock never takes any other locks, then the reads are + * equivalent to a NOP. + */ + if (next->read == 2 || prev->read == 2) + return 1; + /* + * Is the -> dependency already present? + * + * (this may occur even though this is a new chain: consider + * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3 + * chains - the second one will be new, but L1 already has + * L2 added to its dependency list, due to the first chain.) + */ + list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) { + if (entry->class == hlock_class(next)) { + if (distance == 1) + entry->distance = 1; + return 2; + } + } + + if (!trylock_loop && !save_trace(&trace)) + return 0; + + /* + * Ok, all validations passed, add the new lock + * to the previous lock's dependency list: + */ + ret = add_lock_to_list(hlock_class(prev), hlock_class(next), + &hlock_class(prev)->locks_after, + next->acquire_ip, distance, &trace); + + if (!ret) + return 0; + + ret = add_lock_to_list(hlock_class(next), hlock_class(prev), + &hlock_class(next)->locks_before, + next->acquire_ip, distance, &trace); + if (!ret) + return 0; + + /* + * Debugging printouts: + */ + if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { + graph_unlock(); + printk("\n new dependency: "); + print_lock_name(hlock_class(prev)); + printk(" => "); + print_lock_name(hlock_class(next)); + printk("\n"); + dump_stack(); + return graph_lock(); + } + return 1; +} + +/* + * Add the dependency to all directly-previous locks that are 'relevant'. + * The ones that are relevant are (in increasing distance from curr): + * all consecutive trylock entries and the final non-trylock entry - or + * the end of this context's lock-chain - whichever comes first. + */ +static int +check_prevs_add(struct task_struct *curr, struct held_lock *next) +{ + int depth = curr->lockdep_depth; + int trylock_loop = 0; + struct held_lock *hlock; + + /* + * Debugging checks. + * + * Depth must not be zero for a non-head lock: + */ + if (!depth) + goto out_bug; + /* + * At least two relevant locks must exist for this + * to be a head: + */ + if (curr->held_locks[depth].irq_context != + curr->held_locks[depth-1].irq_context) + goto out_bug; + + for (;;) { + int distance = curr->lockdep_depth - depth + 1; + hlock = curr->held_locks + depth-1; + /* + * Only non-recursive-read entries get new dependencies + * added: + */ + if (hlock->read != 2) { + if (!check_prev_add(curr, hlock, next, + distance, trylock_loop)) + return 0; + /* + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: + */ + if (!hlock->trylock) + break; + } + depth--; + /* + * End of lock-stack? + */ + if (!depth) + break; + /* + * Stop the search if we cross into another context: + */ + if (curr->held_locks[depth].irq_context != + curr->held_locks[depth-1].irq_context) + break; + trylock_loop = 1; + } + return 1; +out_bug: + if (!debug_locks_off_graph_unlock()) + return 0; + + /* + * Clearly we all shouldn't be here, but since we made it we + * can reliable say we messed up our state. See the above two + * gotos for reasons why we could possibly end up here. + */ + WARN_ON(1); + + return 0; +} + +unsigned long nr_lock_chains; +struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +int nr_chain_hlocks; +static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; + +struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) +{ + return lock_classes + chain_hlocks[chain->base + i]; +} + +/* + * Look up a dependency chain. If the key is not present yet then + * add it and return 1 - in this case the new dependency chain is + * validated. If the key is already hashed, return 0. + * (On return with 1 graph_lock is held.) + */ +static inline int lookup_chain_cache(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) +{ + struct lock_class *class = hlock_class(hlock); + struct list_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; + struct held_lock *hlock_curr; + int i, j; + + /* + * We might need to take the graph lock, ensure we've got IRQs + * disabled to make this an IRQ-safe lock.. for recursion reasons + * lockdep won't complain about its own locking errors. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; + /* + * We can walk it lock-free, because entries only get added + * to the hash: + */ + list_for_each_entry(chain, hash_head, entry) { + if (chain->chain_key == chain_key) { +cache_hit: + debug_atomic_inc(chain_lookup_hits); + if (very_verbose(class)) + printk("\nhash chain already cached, key: " + "%016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, + class->key, class->name); + return 0; + } + } + if (very_verbose(class)) + printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", + (unsigned long long)chain_key, class->key, class->name); + /* + * Allocate a new chain entry from the static array, and add + * it to the hash: + */ + if (!graph_lock()) + return 0; + /* + * We have to walk the chain again locked - to avoid duplicates: + */ + list_for_each_entry(chain, hash_head, entry) { + if (chain->chain_key == chain_key) { + graph_unlock(); + goto cache_hit; + } + } + if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + if (!debug_locks_off_graph_unlock()) + return 0; + + print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); + dump_stack(); + return 0; + } + chain = lock_chains + nr_lock_chains++; + chain->chain_key = chain_key; + chain->irq_context = hlock->irq_context; + /* Find the first held_lock of current chain */ + for (i = curr->lockdep_depth - 1; i >= 0; i--) { + hlock_curr = curr->held_locks + i; + if (hlock_curr->irq_context != hlock->irq_context) + break; + } + i++; + chain->depth = curr->lockdep_depth + 1 - i; + if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { + chain->base = nr_chain_hlocks; + nr_chain_hlocks += chain->depth; + for (j = 0; j < chain->depth - 1; j++, i++) { + int lock_id = curr->held_locks[i].class_idx - 1; + chain_hlocks[chain->base + j] = lock_id; + } + chain_hlocks[chain->base + j] = class - lock_classes; + } + list_add_tail_rcu(&chain->entry, hash_head); + debug_atomic_inc(chain_lookup_misses); + inc_chains(); + + return 1; +} + +static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, + struct held_lock *hlock, int chain_head, u64 chain_key) +{ + /* + * Trylock needs to maintain the stack of held locks, but it + * does not add new dependencies, because trylock can be done + * in any order. + * + * We look up the chain_key and do the O(N^2) check and update of + * the dependencies only if this is a new dependency chain. + * (If lookup_chain_cache() returns with 1 it acquires + * graph_lock for us) + */ + if (!hlock->trylock && (hlock->check == 2) && + lookup_chain_cache(curr, hlock, chain_key)) { + /* + * Check whether last held lock: + * + * - is irq-safe, if this lock is irq-unsafe + * - is softirq-safe, if this lock is hardirq-unsafe + * + * And check whether the new lock's dependency graph + * could lead back to the previous lock. + * + * any of these scenarios could lead to a deadlock. If + * All validations + */ + int ret = check_deadlock(curr, hlock, lock, hlock->read); + + if (!ret) + return 0; + /* + * Mark recursive read, as we jump over it when + * building dependencies (just like we jump over + * trylock entries): + */ + if (ret == 2) + hlock->read = 2; + /* + * Add dependency only if this lock is not the head + * of the chain, and if it's not a secondary read-lock: + */ + if (!chain_head && ret != 2) + if (!check_prevs_add(curr, hlock)) + return 0; + graph_unlock(); + } else + /* after lookup_chain_cache(): */ + if (unlikely(!debug_locks)) + return 0; + + return 1; +} +#else +static inline int validate_chain(struct task_struct *curr, + struct lockdep_map *lock, struct held_lock *hlock, + int chain_head, u64 chain_key) +{ + return 1; +} +#endif + +/* + * We are building curr_chain_key incrementally, so double-check + * it from scratch, to make sure that it's done correctly: + */ +static void check_chain_key(struct task_struct *curr) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + struct held_lock *hlock, *prev_hlock = NULL; + unsigned int i, id; + u64 chain_key = 0; + + for (i = 0; i < curr->lockdep_depth; i++) { + hlock = curr->held_locks + i; + if (chain_key != hlock->prev_chain_key) { + debug_locks_off(); + /* + * We got mighty confused, our chain keys don't match + * with what we expect, someone trample on our task state? + */ + WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", + curr->lockdep_depth, i, + (unsigned long long)chain_key, + (unsigned long long)hlock->prev_chain_key); + return; + } + id = hlock->class_idx - 1; + /* + * Whoops ran out of static storage again? + */ + if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + return; + + if (prev_hlock && (prev_hlock->irq_context != + hlock->irq_context)) + chain_key = 0; + chain_key = iterate_chain_key(chain_key, id); + prev_hlock = hlock; + } + if (chain_key != curr->curr_chain_key) { + debug_locks_off(); + /* + * More smoking hash instead of calculating it, damn see these + * numbers float.. I bet that a pink elephant stepped on my memory. + */ + WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", + curr->lockdep_depth, i, + (unsigned long long)chain_key, + (unsigned long long)curr->curr_chain_key); + } +#endif +} + +static void +print_usage_bug_scenario(struct held_lock *lock) +{ + struct lock_class *class = hlock_class(lock); + + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0\n"); + printk(" ----\n"); + printk(" lock("); + __print_lock_name(class); + printk(");\n"); + printk(" \n"); + printk(" lock("); + __print_lock_name(class); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); +} + +static int +print_usage_bug(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) +{ + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n"); + printk("=================================\n"); + printk("[ INFO: inconsistent lock state ]\n"); + print_kernel_ident(); + printk("---------------------------------\n"); + + printk("inconsistent {%s} -> {%s} usage.\n", + usage_str[prev_bit], usage_str[new_bit]); + + printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n", + curr->comm, task_pid_nr(curr), + trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT, + trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT, + trace_hardirqs_enabled(curr), + trace_softirqs_enabled(curr)); + print_lock(this); + + printk("{%s} state was registered at:\n", usage_str[prev_bit]); + print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); + + print_irqtrace_events(curr); + printk("\nother info that might help us debug this:\n"); + print_usage_bug_scenario(this); + + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Print out an error if an invalid bit is set: + */ +static inline int +valid_state(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) +{ + if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) + return print_usage_bug(curr, this, bad_bit, new_bit); + return 1; +} + +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit); + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + +/* + * print irq inversion bug: + */ +static int +print_irq_inversion_bug(struct task_struct *curr, + struct lock_list *root, struct lock_list *other, + struct held_lock *this, int forwards, + const char *irqclass) +{ + struct lock_list *entry = other; + struct lock_list *middle = NULL; + int depth; + + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n"); + printk("=========================================================\n"); + printk("[ INFO: possible irq lock inversion dependency detected ]\n"); + print_kernel_ident(); + printk("---------------------------------------------------------\n"); + printk("%s/%d just changed the state of lock:\n", + curr->comm, task_pid_nr(curr)); + print_lock(this); + if (forwards) + printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); + else + printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); + print_lock_name(other->class); + printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); + + printk("\nother info that might help us debug this:\n"); + + /* Find a middle lock (if one exists) */ + depth = get_lock_depth(other); + do { + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad path found in chain graph\n", __func__); + break; + } + middle = entry; + entry = get_lock_parent(entry); + depth--; + } while (entry && entry != root && (depth >= 0)); + if (forwards) + print_irq_lock_scenario(root, other, + middle ? middle->class : root->class, other->class); + else + print_irq_lock_scenario(other, root, + middle ? middle->class : other->class, root->class); + + lockdep_print_held_locks(curr); + + printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); + if (!save_trace(&root->trace)) + return 0; + print_shortest_lock_dependencies(other, root); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Prove that in the forwards-direction subgraph starting at + * there is no lock matching : + */ +static int +check_usage_forwards(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit bit, const char *irqclass) +{ + int ret; + struct lock_list root; + struct lock_list *uninitialized_var(target_entry); + + root.parent = NULL; + root.class = hlock_class(this); + ret = find_usage_forwards(&root, bit, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; + + return print_irq_inversion_bug(curr, &root, target_entry, + this, 1, irqclass); +} + +/* + * Prove that in the backwards-direction subgraph starting at + * there is no lock matching : + */ +static int +check_usage_backwards(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit bit, const char *irqclass) +{ + int ret; + struct lock_list root; + struct lock_list *uninitialized_var(target_entry); + + root.parent = NULL; + root.class = hlock_class(this); + ret = find_usage_backwards(&root, bit, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; + + return print_irq_inversion_bug(curr, &root, target_entry, + this, 0, irqclass); +} + +void print_irqtrace_events(struct task_struct *curr) +{ + printk("irq event stamp: %u\n", curr->irq_events); + printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); + print_ip_sym(curr->hardirq_enable_ip); + printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); + print_ip_sym(curr->hardirq_disable_ip); + printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); + print_ip_sym(curr->softirq_enable_ip); + printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); + print_ip_sym(curr->softirq_disable_ip); +} + +static int HARDIRQ_verbose(struct lock_class *class) +{ +#if HARDIRQ_VERBOSE + return class_filter(class); +#endif + return 0; +} + +static int SOFTIRQ_verbose(struct lock_class *class) +{ +#if SOFTIRQ_VERBOSE + return class_filter(class); +#endif + return 0; +} + +static int RECLAIM_FS_verbose(struct lock_class *class) +{ +#if RECLAIM_VERBOSE + return class_filter(class); +#endif + return 0; +} + +#define STRICT_READ_CHECKS 1 + +static int (*state_verbose_f[])(struct lock_class *class) = { +#define LOCKDEP_STATE(__STATE) \ + __STATE##_verbose, +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static inline int state_verbose(enum lock_usage_bit bit, + struct lock_class *class) +{ + return state_verbose_f[bit >> 2](class); +} + +typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, + enum lock_usage_bit bit, const char *name); + +static int +mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) +{ + int excl_bit = exclusive_bit(new_bit); + int read = new_bit & 1; + int dir = new_bit & 2; + + /* + * mark USED_IN has to look forwards -- to ensure no dependency + * has ENABLED state, which would allow recursion deadlocks. + * + * mark ENABLED has to look backwards -- to ensure no dependee + * has USED_IN state, which, again, would allow recursion deadlocks. + */ + check_usage_f usage = dir ? + check_usage_backwards : check_usage_forwards; + + /* + * Validate that this particular lock does not have conflicting + * usage states. + */ + if (!valid_state(curr, this, new_bit, excl_bit)) + return 0; + + /* + * Validate that the lock dependencies don't have conflicting usage + * states. + */ + if ((!read || !dir || STRICT_READ_CHECKS) && + !usage(curr, this, excl_bit, state_name(new_bit & ~1))) + return 0; + + /* + * Check for read in write conflicts + */ + if (!read) { + if (!valid_state(curr, this, new_bit, excl_bit + 1)) + return 0; + + if (STRICT_READ_CHECKS && + !usage(curr, this, excl_bit + 1, + state_name(new_bit + 1))) + return 0; + } + + if (state_verbose(new_bit, hlock_class(this))) + return 2; + + return 1; +} + +enum mark_type { +#define LOCKDEP_STATE(__STATE) __STATE, +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +/* + * Mark all held locks with a usage bit: + */ +static int +mark_held_locks(struct task_struct *curr, enum mark_type mark) +{ + enum lock_usage_bit usage_bit; + struct held_lock *hlock; + int i; + + for (i = 0; i < curr->lockdep_depth; i++) { + hlock = curr->held_locks + i; + + usage_bit = 2 + (mark << 2); /* ENABLED */ + if (hlock->read) + usage_bit += 1; /* READ */ + + BUG_ON(usage_bit >= LOCK_USAGE_STATES); + + if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) + continue; + + if (!mark_lock(curr, hlock, usage_bit)) + return 0; + } + + return 1; +} + +/* + * Hardirqs will be enabled: + */ +static void __trace_hardirqs_on_caller(unsigned long ip) +{ + struct task_struct *curr = current; + + /* we'll do an OFF -> ON transition: */ + curr->hardirqs_enabled = 1; + + /* + * We are going to turn hardirqs on, so set the + * usage bit for all held locks: + */ + if (!mark_held_locks(curr, HARDIRQ)) + return; + /* + * If we have softirqs enabled, then set the usage + * bit for all held locks. (disabled hardirqs prevented + * this bit from being set before) + */ + if (curr->softirqs_enabled) + if (!mark_held_locks(curr, SOFTIRQ)) + return; + + curr->hardirq_enable_ip = ip; + curr->hardirq_enable_event = ++curr->irq_events; + debug_atomic_inc(hardirqs_on_events); +} + +void trace_hardirqs_on_caller(unsigned long ip) +{ + time_hardirqs_on(CALLER_ADDR0, ip); + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (unlikely(current->hardirqs_enabled)) { + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but losing one hit + * in a stat is not a big deal. + */ + __debug_atomic_inc(redundant_hardirqs_on); + return; + } + + /* + * We're enabling irqs and according to our state above irqs weren't + * already enabled, yet we find the hardware thinks they are in fact + * enabled.. someone messed up their IRQ state tracing. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + /* + * See the fine text that goes along with this variable definition. + */ + if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) + return; + + /* + * Can't allow enabling interrupts while in an interrupt handler, + * that's general bad form and such. Recursion, limited stack etc.. + */ + if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + return; + + current->lockdep_recursion = 1; + __trace_hardirqs_on_caller(ip); + current->lockdep_recursion = 0; +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +void trace_hardirqs_on(void) +{ + trace_hardirqs_on_caller(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +/* + * Hardirqs were disabled: + */ +void trace_hardirqs_off_caller(unsigned long ip) +{ + struct task_struct *curr = current; + + time_hardirqs_off(CALLER_ADDR0, ip); + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + /* + * So we're supposed to get called after you mask local IRQs, but for + * some reason the hardware doesn't quite think you did a proper job. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->hardirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->hardirqs_enabled = 0; + curr->hardirq_disable_ip = ip; + curr->hardirq_disable_event = ++curr->irq_events; + debug_atomic_inc(hardirqs_off_events); + } else + debug_atomic_inc(redundant_hardirqs_off); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +void trace_hardirqs_off(void) +{ + trace_hardirqs_off_caller(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +/* + * Softirqs will be enabled: + */ +void trace_softirqs_on(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + /* + * We fancy IRQs being disabled here, see softirq.c, avoids + * funny state and nesting things. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + debug_atomic_inc(redundant_softirqs_on); + return; + } + + current->lockdep_recursion = 1; + /* + * We'll do an OFF -> ON transition: + */ + curr->softirqs_enabled = 1; + curr->softirq_enable_ip = ip; + curr->softirq_enable_event = ++curr->irq_events; + debug_atomic_inc(softirqs_on_events); + /* + * We are going to turn softirqs on, so set the + * usage bit for all held locks, if hardirqs are + * enabled too: + */ + if (curr->hardirqs_enabled) + mark_held_locks(curr, SOFTIRQ); + current->lockdep_recursion = 0; +} + +/* + * Softirqs were disabled: + */ +void trace_softirqs_off(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + /* + * We fancy IRQs being disabled here, see softirq.c + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->softirqs_enabled = 0; + curr->softirq_disable_ip = ip; + curr->softirq_disable_event = ++curr->irq_events; + debug_atomic_inc(softirqs_off_events); + /* + * Whoops, we wanted softirqs off, so why aren't they? + */ + DEBUG_LOCKS_WARN_ON(!softirq_count()); + } else + debug_atomic_inc(redundant_softirqs_off); +} + +static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + /* no reclaim without waiting on it */ + if (!(gfp_mask & __GFP_WAIT)) + return; + + /* this guy won't enter reclaim */ + if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + return; + + /* We're only interested __GFP_FS allocations for now */ + if (!(gfp_mask & __GFP_FS)) + return; + + /* + * Oi! Can't be having __GFP_FS allocations with IRQs disabled. + */ + if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) + return; + + mark_held_locks(curr, RECLAIM_FS); +} + +static void check_flags(unsigned long flags); + +void lockdep_trace_alloc(gfp_t gfp_mask) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lockdep_trace_alloc(gfp_mask, flags); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) +{ + /* + * If non-trylock use in a hardirq or softirq context, then + * mark the lock as used in these contexts: + */ + if (!hlock->trylock) { + if (hlock->read) { + if (curr->hardirq_context) + if (!mark_lock(curr, hlock, + LOCK_USED_IN_HARDIRQ_READ)) + return 0; + if (curr->softirq_context) + if (!mark_lock(curr, hlock, + LOCK_USED_IN_SOFTIRQ_READ)) + return 0; + } else { + if (curr->hardirq_context) + if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ)) + return 0; + if (curr->softirq_context) + if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ)) + return 0; + } + } + if (!hlock->hardirqs_off) { + if (hlock->read) { + if (!mark_lock(curr, hlock, + LOCK_ENABLED_HARDIRQ_READ)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, + LOCK_ENABLED_SOFTIRQ_READ)) + return 0; + } else { + if (!mark_lock(curr, hlock, + LOCK_ENABLED_HARDIRQ)) + return 0; + if (curr->softirqs_enabled) + if (!mark_lock(curr, hlock, + LOCK_ENABLED_SOFTIRQ)) + return 0; + } + } + + /* + * We reuse the irq context infrastructure more broadly as a general + * context checking code. This tests GFP_FS recursion (a lock taken + * during reclaim for a GFP_FS allocation is held over a GFP_FS + * allocation). + */ + if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) { + if (hlock->read) { + if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ)) + return 0; + } else { + if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS)) + return 0; + } + } + + return 1; +} + +static int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + unsigned int depth = curr->lockdep_depth; + + /* + * Keep track of points where we cross into an interrupt context: + */ + hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) + + curr->softirq_context; + if (depth) { + struct held_lock *prev_hlock; + + prev_hlock = curr->held_locks + depth-1; + /* + * If we cross into another context, reset the + * hash key (this also prevents the checking and the + * adding of the dependency to 'prev'): + */ + if (prev_hlock->irq_context != hlock->irq_context) + return 1; + } + return 0; +} + +#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ + +static inline +int mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) +{ + WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */ + return 1; +} + +static inline int mark_irqflags(struct task_struct *curr, + struct held_lock *hlock) +{ + return 1; +} + +static inline int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + return 0; +} + +void lockdep_trace_alloc(gfp_t gfp_mask) +{ +} + +#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ + +/* + * Mark a lock with a usage bit, and validate the state transition: + */ +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) +{ + unsigned int new_mask = 1 << new_bit, ret = 1; + + /* + * If already set then do not dirty the cacheline, + * nor do any checks: + */ + if (likely(hlock_class(this)->usage_mask & new_mask)) + return 1; + + if (!graph_lock()) + return 0; + /* + * Make sure we didn't race: + */ + if (unlikely(hlock_class(this)->usage_mask & new_mask)) { + graph_unlock(); + return 1; + } + + hlock_class(this)->usage_mask |= new_mask; + + if (!save_trace(hlock_class(this)->usage_traces + new_bit)) + return 0; + + switch (new_bit) { +#define LOCKDEP_STATE(__STATE) \ + case LOCK_USED_IN_##__STATE: \ + case LOCK_USED_IN_##__STATE##_READ: \ + case LOCK_ENABLED_##__STATE: \ + case LOCK_ENABLED_##__STATE##_READ: +#include "lockdep_states.h" +#undef LOCKDEP_STATE + ret = mark_lock_irq(curr, this, new_bit); + if (!ret) + return 0; + break; + case LOCK_USED: + debug_atomic_dec(nr_unused_locks); + break; + default: + if (!debug_locks_off_graph_unlock()) + return 0; + WARN_ON(1); + return 0; + } + + graph_unlock(); + + /* + * We must printk outside of the graph_lock: + */ + if (ret == 2) { + printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); + print_lock(this); + print_irqtrace_events(curr); + dump_stack(); + } + + return ret; +} + +/* + * Initialize a lock instance's lock-class mapping info: + */ +void lockdep_init_map(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass) +{ + int i; + + kmemcheck_mark_initialized(lock, sizeof(*lock)); + + for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) + lock->class_cache[i] = NULL; + +#ifdef CONFIG_LOCK_STAT + lock->cpu = raw_smp_processor_id(); +#endif + + /* + * Can't be having no nameless bastards around this place! + */ + if (DEBUG_LOCKS_WARN_ON(!name)) { + lock->name = "NULL"; + return; + } + + lock->name = name; + + /* + * No key, no joy, we need to hash something. + */ + if (DEBUG_LOCKS_WARN_ON(!key)) + return; + /* + * Sanity check, the lock-class key must be persistent: + */ + if (!static_obj(key)) { + printk("BUG: key %p not in .data!\n", key); + /* + * What it says above ^^^^^, I suggest you read it. + */ + DEBUG_LOCKS_WARN_ON(1); + return; + } + lock->key = key; + + if (unlikely(!debug_locks)) + return; + + if (subclass) + register_lock_class(lock, subclass, 1); +} +EXPORT_SYMBOL_GPL(lockdep_init_map); + +struct lock_class_key __lockdep_no_validate__; +EXPORT_SYMBOL_GPL(__lockdep_no_validate__); + +static int +print_lock_nested_lock_not_held(struct task_struct *curr, + struct held_lock *hlock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n"); + printk("==================================\n"); + printk("[ BUG: Nested lock was not taken ]\n"); + print_kernel_ident(); + printk("----------------------------------\n"); + + printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); + print_lock(hlock); + + printk("\nbut this task is not holding:\n"); + printk("%s\n", hlock->nest_lock->name); + + printk("\nstack backtrace:\n"); + dump_stack(); + + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static int __lock_is_held(struct lockdep_map *lock); + +/* + * This gets called for every mutex_lock*()/spin_lock*() operation. + * We maintain the dependency maps and validate the locking attempt: + */ +static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, int hardirqs_off, + struct lockdep_map *nest_lock, unsigned long ip, + int references) +{ + struct task_struct *curr = current; + struct lock_class *class = NULL; + struct held_lock *hlock; + unsigned int depth, id; + int chain_head = 0; + int class_idx; + u64 chain_key; + + if (!prove_locking) + check = 1; + + if (unlikely(!debug_locks)) + return 0; + + /* + * Lockdep should run with IRQs disabled, otherwise we could + * get an interrupt which would want to take locks, which would + * end up in lockdep and have you got a head-ache already? + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; + + if (lock->key == &__lockdep_no_validate__) + check = 1; + + if (subclass < NR_LOCKDEP_CACHING_CLASSES) + class = lock->class_cache[subclass]; + /* + * Not cached? + */ + if (unlikely(!class)) { + class = register_lock_class(lock, subclass, 0); + if (!class) + return 0; + } + atomic_inc((atomic_t *)&class->ops); + if (very_verbose(class)) { + printk("\nacquire class [%p] %s", class->key, class->name); + if (class->name_version > 1) + printk("#%d", class->name_version); + printk("\n"); + dump_stack(); + } + + /* + * Add the lock to the list of currently held locks. + * (we dont increase the depth just yet, up until the + * dependency checks are done) + */ + depth = curr->lockdep_depth; + /* + * Ran out of static storage for our per-task lock stack again have we? + */ + if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) + return 0; + + class_idx = class - lock_classes + 1; + + if (depth) { + hlock = curr->held_locks + depth - 1; + if (hlock->class_idx == class_idx && nest_lock) { + if (hlock->references) + hlock->references++; + else + hlock->references = 2; + + return 1; + } + } + + hlock = curr->held_locks + depth; + /* + * Plain impossible, we just registered it and checked it weren't no + * NULL like.. I bet this mushroom I ate was good! + */ + if (DEBUG_LOCKS_WARN_ON(!class)) + return 0; + hlock->class_idx = class_idx; + hlock->acquire_ip = ip; + hlock->instance = lock; + hlock->nest_lock = nest_lock; + hlock->trylock = trylock; + hlock->read = read; + hlock->check = check; + hlock->hardirqs_off = !!hardirqs_off; + hlock->references = references; +#ifdef CONFIG_LOCK_STAT + hlock->waittime_stamp = 0; + hlock->holdtime_stamp = lockstat_clock(); +#endif + + if (check == 2 && !mark_irqflags(curr, hlock)) + return 0; + + /* mark it as used: */ + if (!mark_lock(curr, hlock, LOCK_USED)) + return 0; + + /* + * Calculate the chain hash: it's the combined hash of all the + * lock keys along the dependency chain. We save the hash value + * at every step so that we can get the current hash easily + * after unlock. The chain hash is then used to cache dependency + * results. + * + * The 'key ID' is what is the most compact key value to drive + * the hash, not class->key. + */ + id = class - lock_classes; + /* + * Whoops, we did it again.. ran straight out of our static allocation. + */ + if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + return 0; + + chain_key = curr->curr_chain_key; + if (!depth) { + /* + * How can we have a chain hash when we ain't got no keys?! + */ + if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) + return 0; + chain_head = 1; + } + + hlock->prev_chain_key = chain_key; + if (separate_irq_context(curr, hlock)) { + chain_key = 0; + chain_head = 1; + } + chain_key = iterate_chain_key(chain_key, id); + + if (nest_lock && !__lock_is_held(nest_lock)) + return print_lock_nested_lock_not_held(curr, hlock, ip); + + if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) + return 0; + + curr->curr_chain_key = chain_key; + curr->lockdep_depth++; + check_chain_key(curr); +#ifdef CONFIG_DEBUG_LOCKDEP + if (unlikely(!debug_locks)) + return 0; +#endif + if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { + debug_locks_off(); + print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); + printk(KERN_DEBUG "depth: %i max: %lu!\n", + curr->lockdep_depth, MAX_LOCK_DEPTH); + + lockdep_print_held_locks(current); + debug_show_all_locks(); + dump_stack(); + + return 0; + } + + if (unlikely(curr->lockdep_depth > max_lockdep_depth)) + max_lockdep_depth = curr->lockdep_depth; + + return 1; +} + +static int +print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n"); + printk("=====================================\n"); + printk("[ BUG: bad unlock balance detected! ]\n"); + print_kernel_ident(); + printk("-------------------------------------\n"); + printk("%s/%d is trying to release lock (", + curr->comm, task_pid_nr(curr)); + print_lockdep_cache(lock); + printk(") at:\n"); + print_ip_sym(ip); + printk("but there are no more locks to release!\n"); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Common debugging checks for both nested and non-nested unlock: + */ +static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, + unsigned long ip) +{ + if (unlikely(!debug_locks)) + return 0; + /* + * Lockdep should run with IRQs disabled, recursion, head-ache, etc.. + */ + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; + + if (curr->lockdep_depth <= 0) + return print_unlock_imbalance_bug(curr, lock, ip); + + return 1; +} + +static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) +{ + if (hlock->instance == lock) + return 1; + + if (hlock->references) { + struct lock_class *class = lock->class_cache[0]; + + if (!class) + class = look_up_lock_class(lock, 0); + + /* + * If look_up_lock_class() failed to find a class, we're trying + * to test if we hold a lock that has never yet been acquired. + * Clearly if the lock hasn't been acquired _ever_, we're not + * holding it either, so report failure. + */ + if (!class) + return 0; + + /* + * References, but not a lock we're actually ref-counting? + * State got messed up, follow the sites that change ->references + * and try to make sense of it. + */ + if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) + return 0; + + if (hlock->class_idx == class - lock_classes + 1) + return 1; + } + + return 0; +} + +static int +__lock_set_class(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, unsigned int subclass, + unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class *class; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + /* + * This function is about (re)setting the class of a held lock, + * yet we're not actually holding any locks. Naughty user! + */ + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (match_held_lock(hlock, lock)) + goto found_it; + prev_hlock = hlock; + } + return print_unlock_imbalance_bug(curr, lock, ip); + +found_it: + lockdep_init_map(lock, name, key, 0); + class = register_lock_class(lock, subclass, 0); + hlock->class_idx = class - lock_classes + 1; + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + for (; i < depth; i++) { + hlock = curr->held_locks + i; + if (!__lock_acquire(hlock->instance, + hlock_class(hlock)->subclass, hlock->trylock, + hlock->read, hlock->check, hlock->hardirqs_off, + hlock->nest_lock, hlock->acquire_ip, + hlock->references)) + return 0; + } + + /* + * I took it apart and put it back together again, except now I have + * these 'spare' parts.. where shall I put them. + */ + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + +/* + * Remove the lock to the list of currently held locks in a + * potentially non-nested (out of order) manner. This is a + * relatively rare operation, as all the unlock APIs default + * to nested mode (which uses lock_release()): + */ +static int +lock_release_non_nested(struct task_struct *curr, + struct lockdep_map *lock, unsigned long ip) +{ + struct held_lock *hlock, *prev_hlock; + unsigned int depth; + int i; + + /* + * Check whether the lock exists in the current stack + * of held locks: + */ + depth = curr->lockdep_depth; + /* + * So we're all set to release this lock.. wait what lock? We don't + * own any locks, you've been drinking again? + */ + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (match_held_lock(hlock, lock)) + goto found_it; + prev_hlock = hlock; + } + return print_unlock_imbalance_bug(curr, lock, ip); + +found_it: + if (hlock->instance == lock) + lock_release_holdtime(hlock); + + if (hlock->references) { + hlock->references--; + if (hlock->references) { + /* + * We had, and after removing one, still have + * references, the current lock stack is still + * valid. We're done! + */ + return 1; + } + } + + /* + * We have the right lock to unlock, 'hlock' points to it. + * Now we remove it from the stack, and add back the other + * entries (if any), recalculating the hash along the way: + */ + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + for (i++; i < depth; i++) { + hlock = curr->held_locks + i; + if (!__lock_acquire(hlock->instance, + hlock_class(hlock)->subclass, hlock->trylock, + hlock->read, hlock->check, hlock->hardirqs_off, + hlock->nest_lock, hlock->acquire_ip, + hlock->references)) + return 0; + } + + /* + * We had N bottles of beer on the wall, we drank one, but now + * there's not N-1 bottles of beer left on the wall... + */ + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) + return 0; + return 1; +} + +/* + * Remove the lock to the list of currently held locks - this gets + * called on mutex_unlock()/spin_unlock*() (or on a failed + * mutex_lock_interruptible()). This is done for unlocks that nest + * perfectly. (i.e. the current top of the lock-stack is unlocked) + */ +static int lock_release_nested(struct task_struct *curr, + struct lockdep_map *lock, unsigned long ip) +{ + struct held_lock *hlock; + unsigned int depth; + + /* + * Pop off the top of the lock stack: + */ + depth = curr->lockdep_depth - 1; + hlock = curr->held_locks + depth; + + /* + * Is the unlock non-nested: + */ + if (hlock->instance != lock || hlock->references) + return lock_release_non_nested(curr, lock, ip); + curr->lockdep_depth--; + + /* + * No more locks, but somehow we've got hash left over, who left it? + */ + if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) + return 0; + + curr->curr_chain_key = hlock->prev_chain_key; + + lock_release_holdtime(hlock); + +#ifdef CONFIG_DEBUG_LOCKDEP + hlock->prev_chain_key = 0; + hlock->class_idx = 0; + hlock->acquire_ip = 0; + hlock->irq_context = 0; +#endif + return 1; +} + +/* + * Remove the lock to the list of currently held locks - this gets + * called on mutex_unlock()/spin_unlock*() (or on a failed + * mutex_lock_interruptible()). This is done for unlocks that nest + * perfectly. (i.e. the current top of the lock-stack is unlocked) + */ +static void +__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +{ + struct task_struct *curr = current; + + if (!check_unlock(curr, lock, ip)) + return; + + if (nested) { + if (!lock_release_nested(curr, lock, ip)) + return; + } else { + if (!lock_release_non_nested(curr, lock, ip)) + return; + } + + check_chain_key(curr); +} + +static int __lock_is_held(struct lockdep_map *lock) +{ + struct task_struct *curr = current; + int i; + + for (i = 0; i < curr->lockdep_depth; i++) { + struct held_lock *hlock = curr->held_locks + i; + + if (match_held_lock(hlock, lock)) + return 1; + } + + return 0; +} + +/* + * Check whether we follow the irq-flags state precisely: + */ +static void check_flags(unsigned long flags) +{ +#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \ + defined(CONFIG_TRACE_IRQFLAGS) + if (!debug_locks) + return; + + if (irqs_disabled_flags(flags)) { + if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { + printk("possible reason: unannotated irqs-off.\n"); + } + } else { + if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { + printk("possible reason: unannotated irqs-on.\n"); + } + } + + /* + * We dont accurately track softirq state in e.g. + * hardirq contexts (such as on 4KSTACKS), so only + * check if not in hardirq contexts: + */ + if (!hardirq_count()) { + if (softirq_count()) { + /* like the above, but with softirqs */ + DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); + } else { + /* lick the above, does it taste good? */ + DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); + } + } + + if (!debug_locks) + print_irqtrace_events(current); +#endif +} + +void lock_set_class(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, unsigned int subclass, + unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_set_class(lock, name, key, subclass, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_set_class); + +/* + * We are not always called with irqs disabled - do that here, + * and also avoid lockdep recursion: + */ +void lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *nest_lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); + __lock_acquire(lock, subclass, trylock, read, check, + irqs_disabled_flags(flags), nest_lock, ip, 0); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_acquire); + +void lock_release(struct lockdep_map *lock, int nested, + unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + trace_lock_release(lock, ip); + __lock_release(lock, nested, ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_release); + +int lock_is_held(struct lockdep_map *lock) +{ + unsigned long flags; + int ret = 0; + + if (unlikely(current->lockdep_recursion)) + return 1; /* avoid false negative lockdep_assert_held() */ + + raw_local_irq_save(flags); + check_flags(flags); + + current->lockdep_recursion = 1; + ret = __lock_is_held(lock); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(lock_is_held); + +void lockdep_set_current_reclaim_state(gfp_t gfp_mask) +{ + current->lockdep_reclaim_gfp = gfp_mask; +} + +void lockdep_clear_current_reclaim_state(void) +{ + current->lockdep_reclaim_gfp = 0; +} + +#ifdef CONFIG_LOCK_STAT +static int +print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, + unsigned long ip) +{ + if (!debug_locks_off()) + return 0; + if (debug_locks_silent) + return 0; + + printk("\n"); + printk("=================================\n"); + printk("[ BUG: bad contention detected! ]\n"); + print_kernel_ident(); + printk("---------------------------------\n"); + printk("%s/%d is trying to contend lock (", + curr->comm, task_pid_nr(curr)); + print_lockdep_cache(lock); + printk(") at:\n"); + print_ip_sym(ip); + printk("but there are no locks held!\n"); + printk("\nother info that might help us debug this:\n"); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static void +__lock_contended(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + int i, contention_point, contending_point; + + depth = curr->lockdep_depth; + /* + * Whee, we contended on this lock, except it seems we're not + * actually trying to acquire anything much at all.. + */ + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (match_held_lock(hlock, lock)) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, ip); + return; + +found_it: + if (hlock->instance != lock) + return; + + hlock->waittime_stamp = lockstat_clock(); + + contention_point = lock_point(hlock_class(hlock)->contention_point, ip); + contending_point = lock_point(hlock_class(hlock)->contending_point, + lock->ip); + + stats = get_lock_stats(hlock_class(hlock)); + if (contention_point < LOCKSTAT_POINTS) + stats->contention_point[contention_point]++; + if (contending_point < LOCKSTAT_POINTS) + stats->contending_point[contending_point]++; + if (lock->cpu != smp_processor_id()) + stats->bounces[bounce_contended + !!hlock->read]++; + put_lock_stats(stats); +} + +static void +__lock_acquired(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class_stats *stats; + unsigned int depth; + u64 now, waittime = 0; + int i, cpu; + + depth = curr->lockdep_depth; + /* + * Yay, we acquired ownership of this lock we didn't try to + * acquire, how the heck did that happen? + */ + if (DEBUG_LOCKS_WARN_ON(!depth)) + return; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (match_held_lock(hlock, lock)) + goto found_it; + prev_hlock = hlock; + } + print_lock_contention_bug(curr, lock, _RET_IP_); + return; + +found_it: + if (hlock->instance != lock) + return; + + cpu = smp_processor_id(); + if (hlock->waittime_stamp) { + now = lockstat_clock(); + waittime = now - hlock->waittime_stamp; + hlock->holdtime_stamp = now; + } + + trace_lock_acquired(lock, ip); + + stats = get_lock_stats(hlock_class(hlock)); + if (waittime) { + if (hlock->read) + lock_time_inc(&stats->read_waittime, waittime); + else + lock_time_inc(&stats->write_waittime, waittime); + } + if (lock->cpu != cpu) + stats->bounces[bounce_acquired + !!hlock->read]++; + put_lock_stats(stats); + + lock->cpu = cpu; + lock->ip = ip; +} + +void lock_contended(struct lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + trace_lock_contended(lock, ip); + __lock_contended(lock, ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_contended); + +void lock_acquired(struct lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(!lock_stat)) + return; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lock_acquired(lock, ip); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_acquired); +#endif + +/* + * Used by the testsuite, sanitize the validator state + * after a simulated failure: + */ + +void lockdep_reset(void) +{ + unsigned long flags; + int i; + + raw_local_irq_save(flags); + current->curr_chain_key = 0; + current->lockdep_depth = 0; + current->lockdep_recursion = 0; + memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); + nr_hardirq_chains = 0; + nr_softirq_chains = 0; + nr_process_chains = 0; + debug_locks = 1; + for (i = 0; i < CHAINHASH_SIZE; i++) + INIT_LIST_HEAD(chainhash_table + i); + raw_local_irq_restore(flags); +} + +static void zap_class(struct lock_class *class) +{ + int i; + + /* + * Remove all dependencies this lock is + * involved in: + */ + for (i = 0; i < nr_list_entries; i++) { + if (list_entries[i].class == class) + list_del_rcu(&list_entries[i].entry); + } + /* + * Unhash the class and remove it from the all_lock_classes list: + */ + list_del_rcu(&class->hash_entry); + list_del_rcu(&class->lock_entry); + + class->key = NULL; +} + +static inline int within(const void *addr, void *start, unsigned long size) +{ + return addr >= start && addr < start + size; +} + +void lockdep_free_key_range(void *start, unsigned long size) +{ + struct lock_class *class, *next; + struct list_head *head; + unsigned long flags; + int i; + int locked; + + raw_local_irq_save(flags); + locked = graph_lock(); + + /* + * Unhash all classes that were created by this module: + */ + for (i = 0; i < CLASSHASH_SIZE; i++) { + head = classhash_table + i; + if (list_empty(head)) + continue; + list_for_each_entry_safe(class, next, head, hash_entry) { + if (within(class->key, start, size)) + zap_class(class); + else if (within(class->name, start, size)) + zap_class(class); + } + } + + if (locked) + graph_unlock(); + raw_local_irq_restore(flags); +} + +void lockdep_reset_lock(struct lockdep_map *lock) +{ + struct lock_class *class, *next; + struct list_head *head; + unsigned long flags; + int i, j; + int locked; + + raw_local_irq_save(flags); + + /* + * Remove all classes this lock might have: + */ + for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { + /* + * If the class exists we look it up and zap it: + */ + class = look_up_lock_class(lock, j); + if (class) + zap_class(class); + } + /* + * Debug check: in the end all mapped classes should + * be gone. + */ + locked = graph_lock(); + for (i = 0; i < CLASSHASH_SIZE; i++) { + head = classhash_table + i; + if (list_empty(head)) + continue; + list_for_each_entry_safe(class, next, head, hash_entry) { + int match = 0; + + for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) + match |= class == lock->class_cache[j]; + + if (unlikely(match)) { + if (debug_locks_off_graph_unlock()) { + /* + * We all just reset everything, how did it match? + */ + WARN_ON(1); + } + goto out_restore; + } + } + } + if (locked) + graph_unlock(); + +out_restore: + raw_local_irq_restore(flags); +} + +void lockdep_init(void) +{ + int i; + + /* + * Some architectures have their own start_kernel() + * code which calls lockdep_init(), while we also + * call lockdep_init() from the start_kernel() itself, + * and we want to initialize the hashes only once: + */ + if (lockdep_initialized) + return; + + for (i = 0; i < CLASSHASH_SIZE; i++) + INIT_LIST_HEAD(classhash_table + i); + + for (i = 0; i < CHAINHASH_SIZE; i++) + INIT_LIST_HEAD(chainhash_table + i); + + lockdep_initialized = 1; +} + +void __init lockdep_info(void) +{ + printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); + + printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); + printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); + printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); + printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); + printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); + printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); + printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); + + printk(" memory used by lock dependency info: %lu kB\n", + (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + + sizeof(struct list_head) * CLASSHASH_SIZE + + sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + + sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + + sizeof(struct list_head) * CHAINHASH_SIZE +#ifdef CONFIG_PROVE_LOCKING + + sizeof(struct circular_queue) +#endif + ) / 1024 + ); + + printk(" per task-struct memory footprint: %lu bytes\n", + sizeof(struct held_lock) * MAX_LOCK_DEPTH); + +#ifdef CONFIG_DEBUG_LOCKDEP + if (lockdep_init_error) { + printk("WARNING: lockdep init error! lock-%s was acquired" + "before lockdep_init\n", lock_init_error); + printk("Call stack leading to lockdep invocation was:\n"); + print_stack_trace(&lockdep_init_trace, 0); + } +#endif +} + +static void +print_freed_lock_bug(struct task_struct *curr, const void *mem_from, + const void *mem_to, struct held_lock *hlock) +{ + if (!debug_locks_off()) + return; + if (debug_locks_silent) + return; + + printk("\n"); + printk("=========================\n"); + printk("[ BUG: held lock freed! ]\n"); + print_kernel_ident(); + printk("-------------------------\n"); + printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", + curr->comm, task_pid_nr(curr), mem_from, mem_to-1); + print_lock(hlock); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); +} + +static inline int not_in_range(const void* mem_from, unsigned long mem_len, + const void* lock_from, unsigned long lock_len) +{ + return lock_from + lock_len <= mem_from || + mem_from + mem_len <= lock_from; +} + +/* + * Called when kernel memory is freed (or unmapped), or if a lock + * is destroyed or reinitialized - this code checks whether there is + * any held lock in the memory range of to : + */ +void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) +{ + struct task_struct *curr = current; + struct held_lock *hlock; + unsigned long flags; + int i; + + if (unlikely(!debug_locks)) + return; + + local_irq_save(flags); + for (i = 0; i < curr->lockdep_depth; i++) { + hlock = curr->held_locks + i; + + if (not_in_range(mem_from, mem_len, hlock->instance, + sizeof(*hlock->instance))) + continue; + + print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock); + break; + } + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); + +static void print_held_locks_bug(void) +{ + if (!debug_locks_off()) + return; + if (debug_locks_silent) + return; + + printk("\n"); + printk("=====================================\n"); + printk("[ BUG: %s/%d still has locks held! ]\n", + current->comm, task_pid_nr(current)); + print_kernel_ident(); + printk("-------------------------------------\n"); + lockdep_print_held_locks(current); + printk("\nstack backtrace:\n"); + dump_stack(); +} + +void debug_check_no_locks_held(void) +{ + if (unlikely(current->lockdep_depth > 0)) + print_held_locks_bug(); +} +EXPORT_SYMBOL_GPL(debug_check_no_locks_held); + +void debug_show_all_locks(void) +{ + struct task_struct *g, *p; + int count = 10; + int unlock = 1; + + if (unlikely(!debug_locks)) { + printk("INFO: lockdep is turned off.\n"); + return; + } + printk("\nShowing all locks held in the system:\n"); + + /* + * Here we try to get the tasklist_lock as hard as possible, + * if not successful after 2 seconds we ignore it (but keep + * trying). This is to enable a debug printout even if a + * tasklist_lock-holding task deadlocks or crashes. + */ +retry: + if (!read_trylock(&tasklist_lock)) { + if (count == 10) + printk("hm, tasklist_lock locked, retrying... "); + if (count) { + count--; + printk(" #%d", 10-count); + mdelay(200); + goto retry; + } + printk(" ignoring it.\n"); + unlock = 0; + } else { + if (count != 10) + printk(KERN_CONT " locked it.\n"); + } + + do_each_thread(g, p) { + /* + * It's not reliable to print a task's held locks + * if it's not sleeping (or if it's not the current + * task): + */ + if (p->state == TASK_RUNNING && p != current) + continue; + if (p->lockdep_depth) + lockdep_print_held_locks(p); + if (!unlock) + if (read_trylock(&tasklist_lock)) + unlock = 1; + } while_each_thread(g, p); + + printk("\n"); + printk("=============================================\n\n"); + + if (unlock) + read_unlock(&tasklist_lock); +} +EXPORT_SYMBOL_GPL(debug_show_all_locks); + +/* + * Careful: only use this function if you are sure that + * the task cannot run in parallel! + */ +void debug_show_held_locks(struct task_struct *task) +{ + if (unlikely(!debug_locks)) { + printk("INFO: lockdep is turned off.\n"); + return; + } + lockdep_print_held_locks(task); +} +EXPORT_SYMBOL_GPL(debug_show_held_locks); + +void lockdep_sys_exit(void) +{ + struct task_struct *curr = current; + + if (unlikely(curr->lockdep_depth)) { + if (!debug_locks_off()) + return; + printk("\n"); + printk("================================================\n"); + printk("[ BUG: lock held when returning to user space! ]\n"); + print_kernel_ident(); + printk("------------------------------------------------\n"); + printk("%s/%d is leaving the kernel with locks still held!\n", + curr->comm, curr->pid); + lockdep_print_held_locks(curr); + } +} + +void lockdep_rcu_suspicious(const char *file, const int line, const char *s) +{ + struct task_struct *curr = current; + +#ifndef CONFIG_PROVE_RCU_REPEATEDLY + if (!debug_locks_off()) + return; +#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ + /* Note: the following can be executed concurrently, so be careful. */ + printk("\n"); + printk("===============================\n"); + printk("[ INFO: suspicious RCU usage. ]\n"); + print_kernel_ident(); + printk("-------------------------------\n"); + printk("%s:%d %s!\n", file, line, s); + printk("\nother info that might help us debug this:\n\n"); + printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + !rcu_lockdep_current_cpu_online() + ? "RCU used illegally from offline CPU!\n" + : !rcu_is_watching() + ? "RCU used illegally from idle CPU!\n" + : "", + rcu_scheduler_active, debug_locks); + + /* + * If a CPU is in the RCU-free window in idle (ie: in the section + * between rcu_idle_enter() and rcu_idle_exit(), then RCU + * considers that CPU to be in an "extended quiescent state", + * which means that RCU will be completely ignoring that CPU. + * Therefore, rcu_read_lock() and friends have absolutely no + * effect on a CPU running in that state. In other words, even if + * such an RCU-idle CPU has called rcu_read_lock(), RCU might well + * delete data structures out from under it. RCU really has no + * choice here: we need to keep an RCU-free window in idle where + * the CPU may possibly enter into low power mode. This way we can + * notice an extended quiescent state to other CPUs that started a grace + * period. Otherwise we would delay any grace period as long as we run + * in the idle task. + * + * So complain bitterly if someone does call rcu_read_lock(), + * rcu_read_lock_bh() and so on from extended quiescent states. + */ + if (!rcu_is_watching()) + printk("RCU used illegally from extended quiescent state!\n"); + + lockdep_print_held_locks(curr); + printk("\nstack backtrace:\n"); + dump_stack(); +} +EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h new file mode 100644 index 000000000000..4f560cfedc8f --- /dev/null +++ b/kernel/locking/lockdep_internals.h @@ -0,0 +1,170 @@ +/* + * kernel/lockdep_internals.h + * + * Runtime locking correctness validator + * + * lockdep subsystem internal functions and variables. + */ + +/* + * Lock-class usage-state bits: + */ +enum lock_usage_bit { +#define LOCKDEP_STATE(__STATE) \ + LOCK_USED_IN_##__STATE, \ + LOCK_USED_IN_##__STATE##_READ, \ + LOCK_ENABLED_##__STATE, \ + LOCK_ENABLED_##__STATE##_READ, +#include "lockdep_states.h" +#undef LOCKDEP_STATE + LOCK_USED, + LOCK_USAGE_STATES +}; + +/* + * Usage-state bitmasks: + */ +#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE), + +enum { +#define LOCKDEP_STATE(__STATE) \ + __LOCKF(USED_IN_##__STATE) \ + __LOCKF(USED_IN_##__STATE##_READ) \ + __LOCKF(ENABLED_##__STATE) \ + __LOCKF(ENABLED_##__STATE##_READ) +#include "lockdep_states.h" +#undef LOCKDEP_STATE + __LOCKF(USED) +}; + +#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) +#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) + +#define LOCKF_ENABLED_IRQ_READ \ + (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) +#define LOCKF_USED_IN_IRQ_READ \ + (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) + +/* + * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies + * we track. + * + * We use the per-lock dependency maps in two ways: we grow it by adding + * every to-be-taken lock to all currently held lock's own dependency + * table (if it's not there yet), and we check it for lock order + * conflicts and deadlocks. + */ +#define MAX_LOCKDEP_ENTRIES 16384UL + +#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) + +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the hash_lock. + */ +#define MAX_STACK_TRACE_ENTRIES 262144UL + +extern struct list_head all_lock_classes; +extern struct lock_chain lock_chains[]; + +#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) + +extern void get_usage_chars(struct lock_class *class, + char usage[LOCK_USAGE_CHARS]); + +extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); + +struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); + +extern unsigned long nr_lock_classes; +extern unsigned long nr_list_entries; +extern unsigned long nr_lock_chains; +extern int nr_chain_hlocks; +extern unsigned long nr_stack_trace_entries; + +extern unsigned int nr_hardirq_chains; +extern unsigned int nr_softirq_chains; +extern unsigned int nr_process_chains; +extern unsigned int max_lockdep_depth; +extern unsigned int max_recursion_depth; + +extern unsigned int max_bfs_queue_depth; + +#ifdef CONFIG_PROVE_LOCKING +extern unsigned long lockdep_count_forward_deps(struct lock_class *); +extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#else +static inline unsigned long +lockdep_count_forward_deps(struct lock_class *class) +{ + return 0; +} +static inline unsigned long +lockdep_count_backward_deps(struct lock_class *class) +{ + return 0; +} +#endif + +#ifdef CONFIG_DEBUG_LOCKDEP + +#include +/* + * Various lockdep statistics. + * We want them per cpu as they are often accessed in fast path + * and we want to avoid too much cache bouncing. + */ +struct lockdep_stats { + int chain_lookup_hits; + int chain_lookup_misses; + int hardirqs_on_events; + int hardirqs_off_events; + int redundant_hardirqs_on; + int redundant_hardirqs_off; + int softirqs_on_events; + int softirqs_off_events; + int redundant_softirqs_on; + int redundant_softirqs_off; + int nr_unused_locks; + int nr_cyclic_checks; + int nr_cyclic_check_recursions; + int nr_find_usage_forwards_checks; + int nr_find_usage_forwards_recursions; + int nr_find_usage_backwards_checks; + int nr_find_usage_backwards_recursions; +}; + +DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); + +#define __debug_atomic_inc(ptr) \ + this_cpu_inc(lockdep_stats.ptr); + +#define debug_atomic_inc(ptr) { \ + WARN_ON_ONCE(!irqs_disabled()); \ + __this_cpu_inc(lockdep_stats.ptr); \ +} + +#define debug_atomic_dec(ptr) { \ + WARN_ON_ONCE(!irqs_disabled()); \ + __this_cpu_dec(lockdep_stats.ptr); \ +} + +#define debug_atomic_read(ptr) ({ \ + struct lockdep_stats *__cpu_lockdep_stats; \ + unsigned long long __total = 0; \ + int __cpu; \ + for_each_possible_cpu(__cpu) { \ + __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \ + __total += __cpu_lockdep_stats->ptr; \ + } \ + __total; \ +}) +#else +# define __debug_atomic_inc(ptr) do { } while (0) +# define debug_atomic_inc(ptr) do { } while (0) +# define debug_atomic_dec(ptr) do { } while (0) +# define debug_atomic_read(ptr) 0 +#endif diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c new file mode 100644 index 000000000000..09220656d888 --- /dev/null +++ b/kernel/locking/lockdep_proc.c @@ -0,0 +1,683 @@ +/* + * kernel/lockdep_proc.c + * + * Runtime locking correctness validator + * + * Started by Ingo Molnar: + * + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * Code for /proc/lockdep and /proc/lockdep_stats: + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lockdep_internals.h" + +static void *l_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &all_lock_classes, pos); +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + return seq_list_start_head(&all_lock_classes, *pos); +} + +static void l_stop(struct seq_file *m, void *v) +{ +} + +static void print_name(struct seq_file *m, struct lock_class *class) +{ + char str[KSYM_NAME_LEN]; + const char *name = class->name; + + if (!name) { + name = __get_key_name(class->key, str); + seq_printf(m, "%s", name); + } else{ + seq_printf(m, "%s", name); + if (class->name_version > 1) + seq_printf(m, "#%d", class->name_version); + if (class->subclass) + seq_printf(m, "/%d", class->subclass); + } +} + +static int l_show(struct seq_file *m, void *v) +{ + struct lock_class *class = list_entry(v, struct lock_class, lock_entry); + struct lock_list *entry; + char usage[LOCK_USAGE_CHARS]; + + if (v == &all_lock_classes) { + seq_printf(m, "all lock classes:\n"); + return 0; + } + + seq_printf(m, "%p", class->key); +#ifdef CONFIG_DEBUG_LOCKDEP + seq_printf(m, " OPS:%8ld", class->ops); +#endif +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); + seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); +#endif + + get_usage_chars(class, usage); + seq_printf(m, " %s", usage); + + seq_printf(m, ": "); + print_name(m, class); + seq_puts(m, "\n"); + + list_for_each_entry(entry, &class->locks_after, entry) { + if (entry->distance == 1) { + seq_printf(m, " -> [%p] ", entry->class->key); + print_name(m, entry->class); + seq_puts(m, "\n"); + } + } + seq_puts(m, "\n"); + + return 0; +} + +static const struct seq_operations lockdep_ops = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show, +}; + +static int lockdep_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &lockdep_ops); +} + +static const struct file_operations proc_lockdep_operations = { + .open = lockdep_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#ifdef CONFIG_PROVE_LOCKING +static void *lc_start(struct seq_file *m, loff_t *pos) +{ + if (*pos == 0) + return SEQ_START_TOKEN; + + if (*pos - 1 < nr_lock_chains) + return lock_chains + (*pos - 1); + + return NULL; +} + +static void *lc_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return lc_start(m, pos); +} + +static void lc_stop(struct seq_file *m, void *v) +{ +} + +static int lc_show(struct seq_file *m, void *v) +{ + struct lock_chain *chain = v; + struct lock_class *class; + int i; + + if (v == SEQ_START_TOKEN) { + seq_printf(m, "all lock chains:\n"); + return 0; + } + + seq_printf(m, "irq_context: %d\n", chain->irq_context); + + for (i = 0; i < chain->depth; i++) { + class = lock_chain_get_class(chain, i); + if (!class->key) + continue; + + seq_printf(m, "[%p] ", class->key); + print_name(m, class); + seq_puts(m, "\n"); + } + seq_puts(m, "\n"); + + return 0; +} + +static const struct seq_operations lockdep_chains_ops = { + .start = lc_start, + .next = lc_next, + .stop = lc_stop, + .show = lc_show, +}; + +static int lockdep_chains_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &lockdep_chains_ops); +} + +static const struct file_operations proc_lockdep_chains_operations = { + .open = lockdep_chains_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROVE_LOCKING */ + +static void lockdep_stats_debug_show(struct seq_file *m) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + unsigned long long hi1 = debug_atomic_read(hardirqs_on_events), + hi2 = debug_atomic_read(hardirqs_off_events), + hr1 = debug_atomic_read(redundant_hardirqs_on), + hr2 = debug_atomic_read(redundant_hardirqs_off), + si1 = debug_atomic_read(softirqs_on_events), + si2 = debug_atomic_read(softirqs_off_events), + sr1 = debug_atomic_read(redundant_softirqs_on), + sr2 = debug_atomic_read(redundant_softirqs_off); + + seq_printf(m, " chain lookup misses: %11llu\n", + debug_atomic_read(chain_lookup_misses)); + seq_printf(m, " chain lookup hits: %11llu\n", + debug_atomic_read(chain_lookup_hits)); + seq_printf(m, " cyclic checks: %11llu\n", + debug_atomic_read(nr_cyclic_checks)); + seq_printf(m, " find-mask forwards checks: %11llu\n", + debug_atomic_read(nr_find_usage_forwards_checks)); + seq_printf(m, " find-mask backwards checks: %11llu\n", + debug_atomic_read(nr_find_usage_backwards_checks)); + + seq_printf(m, " hardirq on events: %11llu\n", hi1); + seq_printf(m, " hardirq off events: %11llu\n", hi2); + seq_printf(m, " redundant hardirq ons: %11llu\n", hr1); + seq_printf(m, " redundant hardirq offs: %11llu\n", hr2); + seq_printf(m, " softirq on events: %11llu\n", si1); + seq_printf(m, " softirq off events: %11llu\n", si2); + seq_printf(m, " redundant softirq ons: %11llu\n", sr1); + seq_printf(m, " redundant softirq offs: %11llu\n", sr2); +#endif +} + +static int lockdep_stats_show(struct seq_file *m, void *v) +{ + struct lock_class *class; + unsigned long nr_unused = 0, nr_uncategorized = 0, + nr_irq_safe = 0, nr_irq_unsafe = 0, + nr_softirq_safe = 0, nr_softirq_unsafe = 0, + nr_hardirq_safe = 0, nr_hardirq_unsafe = 0, + nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, + nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, + nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, + sum_forward_deps = 0; + + list_for_each_entry(class, &all_lock_classes, lock_entry) { + + if (class->usage_mask == 0) + nr_unused++; + if (class->usage_mask == LOCKF_USED) + nr_uncategorized++; + if (class->usage_mask & LOCKF_USED_IN_IRQ) + nr_irq_safe++; + if (class->usage_mask & LOCKF_ENABLED_IRQ) + nr_irq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) + nr_softirq_safe++; + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ) + nr_softirq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) + nr_hardirq_safe++; + if (class->usage_mask & LOCKF_ENABLED_HARDIRQ) + nr_hardirq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) + nr_irq_read_safe++; + if (class->usage_mask & LOCKF_ENABLED_IRQ_READ) + nr_irq_read_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) + nr_softirq_read_safe++; + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ) + nr_softirq_read_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) + nr_hardirq_read_safe++; + if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) + nr_hardirq_read_unsafe++; + +#ifdef CONFIG_PROVE_LOCKING + sum_forward_deps += lockdep_count_forward_deps(class); +#endif + } +#ifdef CONFIG_DEBUG_LOCKDEP + DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); +#endif + seq_printf(m, " lock-classes: %11lu [max: %lu]\n", + nr_lock_classes, MAX_LOCKDEP_KEYS); + seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", + nr_list_entries, MAX_LOCKDEP_ENTRIES); + seq_printf(m, " indirect dependencies: %11lu\n", + sum_forward_deps); + + /* + * Total number of dependencies: + * + * All irq-safe locks may nest inside irq-unsafe locks, + * plus all the other known dependencies: + */ + seq_printf(m, " all direct dependencies: %11lu\n", + nr_irq_unsafe * nr_irq_safe + + nr_hardirq_unsafe * nr_hardirq_safe + + nr_list_entries); + +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " dependency chains: %11lu [max: %lu]\n", + nr_lock_chains, MAX_LOCKDEP_CHAINS); + seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", + nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS + seq_printf(m, " in-hardirq chains: %11u\n", + nr_hardirq_chains); + seq_printf(m, " in-softirq chains: %11u\n", + nr_softirq_chains); +#endif + seq_printf(m, " in-process chains: %11u\n", + nr_process_chains); + seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", + nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); + seq_printf(m, " combined max dependencies: %11u\n", + (nr_hardirq_chains + 1) * + (nr_softirq_chains + 1) * + (nr_process_chains + 1) + ); + seq_printf(m, " hardirq-safe locks: %11lu\n", + nr_hardirq_safe); + seq_printf(m, " hardirq-unsafe locks: %11lu\n", + nr_hardirq_unsafe); + seq_printf(m, " softirq-safe locks: %11lu\n", + nr_softirq_safe); + seq_printf(m, " softirq-unsafe locks: %11lu\n", + nr_softirq_unsafe); + seq_printf(m, " irq-safe locks: %11lu\n", + nr_irq_safe); + seq_printf(m, " irq-unsafe locks: %11lu\n", + nr_irq_unsafe); + + seq_printf(m, " hardirq-read-safe locks: %11lu\n", + nr_hardirq_read_safe); + seq_printf(m, " hardirq-read-unsafe locks: %11lu\n", + nr_hardirq_read_unsafe); + seq_printf(m, " softirq-read-safe locks: %11lu\n", + nr_softirq_read_safe); + seq_printf(m, " softirq-read-unsafe locks: %11lu\n", + nr_softirq_read_unsafe); + seq_printf(m, " irq-read-safe locks: %11lu\n", + nr_irq_read_safe); + seq_printf(m, " irq-read-unsafe locks: %11lu\n", + nr_irq_read_unsafe); + + seq_printf(m, " uncategorized locks: %11lu\n", + nr_uncategorized); + seq_printf(m, " unused locks: %11lu\n", + nr_unused); + seq_printf(m, " max locking depth: %11u\n", + max_lockdep_depth); +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " max bfs queue depth: %11u\n", + max_bfs_queue_depth); +#endif + lockdep_stats_debug_show(m); + seq_printf(m, " debug_locks: %11u\n", + debug_locks); + + return 0; +} + +static int lockdep_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, lockdep_stats_show, NULL); +} + +static const struct file_operations proc_lockdep_stats_operations = { + .open = lockdep_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_LOCK_STAT + +struct lock_stat_data { + struct lock_class *class; + struct lock_class_stats stats; +}; + +struct lock_stat_seq { + struct lock_stat_data *iter_end; + struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; +}; + +/* + * sort on absolute number of contentions + */ +static int lock_stat_cmp(const void *l, const void *r) +{ + const struct lock_stat_data *dl = l, *dr = r; + unsigned long nl, nr; + + nl = dl->stats.read_waittime.nr + dl->stats.write_waittime.nr; + nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr; + + return nr - nl; +} + +static void seq_line(struct seq_file *m, char c, int offset, int length) +{ + int i; + + for (i = 0; i < offset; i++) + seq_puts(m, " "); + for (i = 0; i < length; i++) + seq_printf(m, "%c", c); + seq_puts(m, "\n"); +} + +static void snprint_time(char *buf, size_t bufsiz, s64 nr) +{ + s64 div; + s32 rem; + + nr += 5; /* for display rounding */ + div = div_s64_rem(nr, 1000, &rem); + snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10); +} + +static void seq_time(struct seq_file *m, s64 time) +{ + char num[15]; + + snprint_time(num, sizeof(num), time); + seq_printf(m, " %14s", num); +} + +static void seq_lock_time(struct seq_file *m, struct lock_time *lt) +{ + seq_printf(m, "%14lu", lt->nr); + seq_time(m, lt->min); + seq_time(m, lt->max); + seq_time(m, lt->total); + seq_time(m, lt->nr ? do_div(lt->total, lt->nr) : 0); +} + +static void seq_stats(struct seq_file *m, struct lock_stat_data *data) +{ + char name[39]; + struct lock_class *class; + struct lock_class_stats *stats; + int i, namelen; + + class = data->class; + stats = &data->stats; + + namelen = 38; + if (class->name_version > 1) + namelen -= 2; /* XXX truncates versions > 9 */ + if (class->subclass) + namelen -= 2; + + if (!class->name) { + char str[KSYM_NAME_LEN]; + const char *key_name; + + key_name = __get_key_name(class->key, str); + snprintf(name, namelen, "%s", key_name); + } else { + snprintf(name, namelen, "%s", class->name); + } + namelen = strlen(name); + if (class->name_version > 1) { + snprintf(name+namelen, 3, "#%d", class->name_version); + namelen += 2; + } + if (class->subclass) { + snprintf(name+namelen, 3, "/%d", class->subclass); + namelen += 2; + } + + if (stats->write_holdtime.nr) { + if (stats->read_holdtime.nr) + seq_printf(m, "%38s-W:", name); + else + seq_printf(m, "%40s:", name); + + seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]); + seq_lock_time(m, &stats->write_waittime); + seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_write]); + seq_lock_time(m, &stats->write_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_holdtime.nr) { + seq_printf(m, "%38s-R:", name); + seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]); + seq_lock_time(m, &stats->read_waittime); + seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]); + seq_lock_time(m, &stats->read_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_waittime.nr + stats->write_waittime.nr == 0) + return; + + if (stats->read_holdtime.nr) + namelen += 2; + + for (i = 0; i < LOCKSTAT_POINTS; i++) { + char ip[32]; + + if (class->contention_point[i] == 0) + break; + + if (!i) + seq_line(m, '-', 40-namelen, namelen); + + snprintf(ip, sizeof(ip), "[<%p>]", + (void *)class->contention_point[i]); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contention_point[i], + ip, (void *)class->contention_point[i]); + } + for (i = 0; i < LOCKSTAT_POINTS; i++) { + char ip[32]; + + if (class->contending_point[i] == 0) + break; + + if (!i) + seq_line(m, '-', 40-namelen, namelen); + + snprintf(ip, sizeof(ip), "[<%p>]", + (void *)class->contending_point[i]); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contending_point[i], + ip, (void *)class->contending_point[i]); + } + if (i) { + seq_puts(m, "\n"); + seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1)); + seq_puts(m, "\n"); + } +} + +static void seq_header(struct seq_file *m) +{ + seq_puts(m, "lock_stat version 0.4\n"); + + if (unlikely(!debug_locks)) + seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n"); + + seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s " + "%14s %14s\n", + "class name", + "con-bounces", + "contentions", + "waittime-min", + "waittime-max", + "waittime-total", + "waittime-avg", + "acq-bounces", + "acquisitions", + "holdtime-min", + "holdtime-max", + "holdtime-total", + "holdtime-avg"); + seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); + seq_printf(m, "\n"); +} + +static void *ls_start(struct seq_file *m, loff_t *pos) +{ + struct lock_stat_seq *data = m->private; + struct lock_stat_data *iter; + + if (*pos == 0) + return SEQ_START_TOKEN; + + iter = data->stats + (*pos - 1); + if (iter >= data->iter_end) + iter = NULL; + + return iter; +} + +static void *ls_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return ls_start(m, pos); +} + +static void ls_stop(struct seq_file *m, void *v) +{ +} + +static int ls_show(struct seq_file *m, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_header(m); + else + seq_stats(m, v); + + return 0; +} + +static const struct seq_operations lockstat_ops = { + .start = ls_start, + .next = ls_next, + .stop = ls_stop, + .show = ls_show, +}; + +static int lock_stat_open(struct inode *inode, struct file *file) +{ + int res; + struct lock_class *class; + struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); + + if (!data) + return -ENOMEM; + + res = seq_open(file, &lockstat_ops); + if (!res) { + struct lock_stat_data *iter = data->stats; + struct seq_file *m = file->private_data; + + list_for_each_entry(class, &all_lock_classes, lock_entry) { + iter->class = class; + iter->stats = lock_stats(class); + iter++; + } + data->iter_end = iter; + + sort(data->stats, data->iter_end - data->stats, + sizeof(struct lock_stat_data), + lock_stat_cmp, NULL); + + m->private = data; + } else + vfree(data); + + return res; +} + +static ssize_t lock_stat_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct lock_class *class; + char c; + + if (count) { + if (get_user(c, buf)) + return -EFAULT; + + if (c != '0') + return count; + + list_for_each_entry(class, &all_lock_classes, lock_entry) + clear_lock_stats(class); + } + return count; +} + +static int lock_stat_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + vfree(seq->private); + return seq_release(inode, file); +} + +static const struct file_operations proc_lock_stat_operations = { + .open = lock_stat_open, + .write = lock_stat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = lock_stat_release, +}; +#endif /* CONFIG_LOCK_STAT */ + +static int __init lockdep_proc_init(void) +{ + proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); +#ifdef CONFIG_PROVE_LOCKING + proc_create("lockdep_chains", S_IRUSR, NULL, + &proc_lockdep_chains_operations); +#endif + proc_create("lockdep_stats", S_IRUSR, NULL, + &proc_lockdep_stats_operations); + +#ifdef CONFIG_LOCK_STAT + proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, + &proc_lock_stat_operations); +#endif + + return 0; +} + +__initcall(lockdep_proc_init); + diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h new file mode 100644 index 000000000000..995b0cc2b84c --- /dev/null +++ b/kernel/locking/lockdep_states.h @@ -0,0 +1,9 @@ +/* + * Lockdep states, + * + * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever + * you add one, or come up with a nice dynamic solution. + */ +LOCKDEP_STATE(HARDIRQ) +LOCKDEP_STATE(SOFTIRQ) +LOCKDEP_STATE(RECLAIM_FS) -- cgit v1.3 From 60fc28746a7b61775ae28950ddf7a4ac15955639 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 18:15:36 +0100 Subject: locking: Move the spinlock code to kernel/locking/ Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-b81ol0z3mon45m51o131yc9j@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Makefile | 3 - kernel/locking/Makefile | 4 + kernel/locking/spinlock.c | 399 ++++++++++++++++++++++++++++++++++++++++ kernel/locking/spinlock_debug.c | 302 ++++++++++++++++++++++++++++++ kernel/spinlock.c | 399 ---------------------------------------- lib/Makefile | 1 - lib/spinlock_debug.c | 302 ------------------------------ 7 files changed, 705 insertions(+), 705 deletions(-) create mode 100644 kernel/locking/spinlock.c create mode 100644 kernel/locking/spinlock_debug.c delete mode 100644 kernel/spinlock.c delete mode 100644 lib/spinlock_debug.c (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 4fffd6ee42c1..4bce165dce5d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,9 +43,6 @@ obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif -obj-$(CONFIG_SMP) += spinlock.o -obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o -obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index c103599fc1ba..674d2152d10f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -13,3 +13,7 @@ obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif +obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_PROVE_LOCKING) += spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c new file mode 100644 index 000000000000..4b082b5cac9e --- /dev/null +++ b/kernel/locking/spinlock.c @@ -0,0 +1,399 @@ +/* + * Copyright (2004) Linus Torvalds + * + * Author: Zwane Mwaikambo + * + * Copyright (2004, 2005) Ingo Molnar + * + * This file contains the spinlock/rwlock implementations for the + * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) + * + * Note that some architectures have special knowledge about the + * stack frames of these functions in their profile_pc. If you + * change anything significant here that could change the stack + * frame contact the architecture maintainers. + */ + +#include +#include +#include +#include +#include +#include + +/* + * If lockdep is enabled then we use the non-preemption spin-ops + * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are + * not re-enabled during lock-acquire (which the preempt-spin-ops do): + */ +#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) +/* + * The __lock_function inlines are taken from + * include/linux/spinlock_api_smp.h + */ +#else +#define raw_read_can_lock(l) read_can_lock(l) +#define raw_write_can_lock(l) write_can_lock(l) + +/* + * Some architectures can relax in favour of the CPU owning the lock. + */ +#ifndef arch_read_relax +# define arch_read_relax(l) cpu_relax() +#endif +#ifndef arch_write_relax +# define arch_write_relax(l) cpu_relax() +#endif +#ifndef arch_spin_relax +# define arch_spin_relax(l) cpu_relax() +#endif + +/* + * We build the __lock_function inlines here. They are too large for + * inlining all over the place, but here is only one user per function + * which embedds them into the calling _lock_function below. + * + * This could be a long-held lock. We both prepare to spin for a long + * time (making _this_ CPU preemptable if possible), and we also signal + * towards that other CPU that it should break the lock ASAP. + */ +#define BUILD_LOCK_OPS(op, locktype) \ +void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ +{ \ + for (;;) { \ + preempt_disable(); \ + if (likely(do_raw_##op##_trylock(lock))) \ + break; \ + preempt_enable(); \ + \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + arch_##op##_relax(&lock->raw_lock); \ + } \ + (lock)->break_lock = 0; \ +} \ + \ +unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ +{ \ + unsigned long flags; \ + \ + for (;;) { \ + preempt_disable(); \ + local_irq_save(flags); \ + if (likely(do_raw_##op##_trylock(lock))) \ + break; \ + local_irq_restore(flags); \ + preempt_enable(); \ + \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + arch_##op##_relax(&lock->raw_lock); \ + } \ + (lock)->break_lock = 0; \ + return flags; \ +} \ + \ +void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ +{ \ + _raw_##op##_lock_irqsave(lock); \ +} \ + \ +void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ +{ \ + unsigned long flags; \ + \ + /* */ \ + /* Careful: we must exclude softirqs too, hence the */ \ + /* irq-disabling. We use the generic preemption-aware */ \ + /* function: */ \ + /**/ \ + flags = _raw_##op##_lock_irqsave(lock); \ + local_bh_disable(); \ + local_irq_restore(flags); \ +} \ + +/* + * Build preemption-friendly versions of the following + * lock-spinning functions: + * + * __[spin|read|write]_lock() + * __[spin|read|write]_lock_irq() + * __[spin|read|write]_lock_irqsave() + * __[spin|read|write]_lock_bh() + */ +BUILD_LOCK_OPS(spin, raw_spinlock); +BUILD_LOCK_OPS(read, rwlock); +BUILD_LOCK_OPS(write, rwlock); + +#endif + +#ifndef CONFIG_INLINE_SPIN_TRYLOCK +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +{ + return __raw_spin_trylock(lock); +} +EXPORT_SYMBOL(_raw_spin_trylock); +#endif + +#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH +int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) +{ + return __raw_spin_trylock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_trylock_bh); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) +{ + __raw_spin_lock(lock); +} +EXPORT_SYMBOL(_raw_spin_lock); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE +unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +{ + return __raw_spin_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ +void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) +{ + __raw_spin_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_irq); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_BH +void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +{ + __raw_spin_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh); +#endif + +#ifdef CONFIG_UNINLINE_SPIN_UNLOCK +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) +{ + __raw_spin_unlock(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE +void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) +{ + __raw_spin_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ +void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) +{ + __raw_spin_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH +void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) +{ + __raw_spin_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock_bh); +#endif + +#ifndef CONFIG_INLINE_READ_TRYLOCK +int __lockfunc _raw_read_trylock(rwlock_t *lock) +{ + return __raw_read_trylock(lock); +} +EXPORT_SYMBOL(_raw_read_trylock); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK +void __lockfunc _raw_read_lock(rwlock_t *lock) +{ + __raw_read_lock(lock); +} +EXPORT_SYMBOL(_raw_read_lock); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE +unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) +{ + return __raw_read_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_read_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_IRQ +void __lockfunc _raw_read_lock_irq(rwlock_t *lock) +{ + __raw_read_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_read_lock_irq); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_BH +void __lockfunc _raw_read_lock_bh(rwlock_t *lock) +{ + __raw_read_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_read_lock_bh); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK +void __lockfunc _raw_read_unlock(rwlock_t *lock) +{ + __raw_read_unlock(lock); +} +EXPORT_SYMBOL(_raw_read_unlock); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE +void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + __raw_read_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_read_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ +void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) +{ + __raw_read_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_read_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_BH +void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) +{ + __raw_read_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_read_unlock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_TRYLOCK +int __lockfunc _raw_write_trylock(rwlock_t *lock) +{ + return __raw_write_trylock(lock); +} +EXPORT_SYMBOL(_raw_write_trylock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK +void __lockfunc _raw_write_lock(rwlock_t *lock) +{ + __raw_write_lock(lock); +} +EXPORT_SYMBOL(_raw_write_lock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE +unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +{ + return __raw_write_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ +void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +{ + __raw_write_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_BH +void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +{ + __raw_write_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_lock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK +void __lockfunc _raw_write_unlock(rwlock_t *lock) +{ + __raw_write_unlock(lock); +} +EXPORT_SYMBOL(_raw_write_unlock); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE +void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + __raw_write_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_write_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ +void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +{ + __raw_write_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH +void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +{ + __raw_write_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_bh); +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) +{ + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_nested); + +unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, + int subclass) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, + do_raw_spin_lock_flags, &flags); + return flags; +} +EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); + +void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + preempt_disable(); + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_nest_lock); + +#endif + +notrace int in_lock_functions(unsigned long addr) +{ + /* Linker adds these: start and end of __lockfunc functions */ + extern char __lock_text_start[], __lock_text_end[]; + + return addr >= (unsigned long)__lock_text_start + && addr < (unsigned long)__lock_text_end; +} +EXPORT_SYMBOL(in_lock_functions); diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c new file mode 100644 index 000000000000..0374a596cffa --- /dev/null +++ b/kernel/locking/spinlock_debug.c @@ -0,0 +1,302 @@ +/* + * Copyright 2005, Red Hat, Inc., Ingo Molnar + * Released under the General Public License (GPL). + * + * This file contains the spinlock/rwlock implementations for + * DEBUG_SPINLOCK. + */ + +#include +#include +#include +#include +#include +#include + +void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + lock->magic = SPINLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__raw_spin_lock_init); + +void __rwlock_init(rwlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; + lock->magic = RWLOCK_MAGIC; + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__rwlock_init); + +static void spin_dump(raw_spinlock_t *lock, const char *msg) +{ + struct task_struct *owner = NULL; + + if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) + owner = lock->owner; + printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", + msg, raw_smp_processor_id(), + current->comm, task_pid_nr(current)); + printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " + ".owner_cpu: %d\n", + lock, lock->magic, + owner ? owner->comm : "", + owner ? task_pid_nr(owner) : -1, + lock->owner_cpu); + dump_stack(); +} + +static void spin_bug(raw_spinlock_t *lock, const char *msg) +{ + if (!debug_locks_off()) + return; + + spin_dump(lock, msg); +} + +#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) + +static inline void +debug_spin_lock_before(raw_spinlock_t *lock) +{ + SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(lock->owner == current, lock, "recursion"); + SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + lock, "cpu recursion"); +} + +static inline void debug_spin_lock_after(raw_spinlock_t *lock) +{ + lock->owner_cpu = raw_smp_processor_id(); + lock->owner = current; +} + +static inline void debug_spin_unlock(raw_spinlock_t *lock) +{ + SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); + SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked"); + SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); + SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), + lock, "wrong CPU"); + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +static void __spin_lock_debug(raw_spinlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + + for (i = 0; i < loops; i++) { + if (arch_spin_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + spin_dump(lock, "lockup suspected"); +#ifdef CONFIG_SMP + trigger_all_cpu_backtrace(); +#endif + + /* + * The trylock above was causing a livelock. Give the lower level arch + * specific lock code a chance to acquire the lock. We have already + * printed a warning/backtrace at this point. The non-debug arch + * specific code might actually succeed in acquiring the lock. If it is + * not successful, the end-result is the same - there is no forward + * progress. + */ + arch_spin_lock(&lock->raw_lock); +} + +void do_raw_spin_lock(raw_spinlock_t *lock) +{ + debug_spin_lock_before(lock); + if (unlikely(!arch_spin_trylock(&lock->raw_lock))) + __spin_lock_debug(lock); + debug_spin_lock_after(lock); +} + +int do_raw_spin_trylock(raw_spinlock_t *lock) +{ + int ret = arch_spin_trylock(&lock->raw_lock); + + if (ret) + debug_spin_lock_after(lock); +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + SPIN_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_spin_unlock(raw_spinlock_t *lock) +{ + debug_spin_unlock(lock); + arch_spin_unlock(&lock->raw_lock); +} + +static void rwlock_bug(rwlock_t *lock, const char *msg) +{ + if (!debug_locks_off()) + return; + + printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", + msg, raw_smp_processor_id(), current->comm, + task_pid_nr(current), lock); + dump_stack(); +} + +#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) + +#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ +static void __read_lock_debug(rwlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + int print_once = 1; + + for (;;) { + for (i = 0; i < loops; i++) { + if (arch_read_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + if (print_once) { + print_once = 0; + printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " + "%s/%d, %p\n", + raw_smp_processor_id(), current->comm, + current->pid, lock); + dump_stack(); + } + } +} +#endif + +void do_raw_read_lock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + arch_read_lock(&lock->raw_lock); +} + +int do_raw_read_trylock(rwlock_t *lock) +{ + int ret = arch_read_trylock(&lock->raw_lock); + +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_read_unlock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + arch_read_unlock(&lock->raw_lock); +} + +static inline void debug_write_lock_before(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); + RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + lock, "cpu recursion"); +} + +static inline void debug_write_lock_after(rwlock_t *lock) +{ + lock->owner_cpu = raw_smp_processor_id(); + lock->owner = current; +} + +static inline void debug_write_unlock(rwlock_t *lock) +{ + RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); + RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); + RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), + lock, "wrong CPU"); + lock->owner = SPINLOCK_OWNER_INIT; + lock->owner_cpu = -1; +} + +#if 0 /* This can cause lockups */ +static void __write_lock_debug(rwlock_t *lock) +{ + u64 i; + u64 loops = loops_per_jiffy * HZ; + int print_once = 1; + + for (;;) { + for (i = 0; i < loops; i++) { + if (arch_write_trylock(&lock->raw_lock)) + return; + __delay(1); + } + /* lockup suspected: */ + if (print_once) { + print_once = 0; + printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " + "%s/%d, %p\n", + raw_smp_processor_id(), current->comm, + current->pid, lock); + dump_stack(); + } + } +} +#endif + +void do_raw_write_lock(rwlock_t *lock) +{ + debug_write_lock_before(lock); + arch_write_lock(&lock->raw_lock); + debug_write_lock_after(lock); +} + +int do_raw_write_trylock(rwlock_t *lock) +{ + int ret = arch_write_trylock(&lock->raw_lock); + + if (ret) + debug_write_lock_after(lock); +#ifndef CONFIG_SMP + /* + * Must not happen on UP: + */ + RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif + return ret; +} + +void do_raw_write_unlock(rwlock_t *lock) +{ + debug_write_unlock(lock); + arch_write_unlock(&lock->raw_lock); +} diff --git a/kernel/spinlock.c b/kernel/spinlock.c deleted file mode 100644 index 4b082b5cac9e..000000000000 --- a/kernel/spinlock.c +++ /dev/null @@ -1,399 +0,0 @@ -/* - * Copyright (2004) Linus Torvalds - * - * Author: Zwane Mwaikambo - * - * Copyright (2004, 2005) Ingo Molnar - * - * This file contains the spinlock/rwlock implementations for the - * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) - * - * Note that some architectures have special knowledge about the - * stack frames of these functions in their profile_pc. If you - * change anything significant here that could change the stack - * frame contact the architecture maintainers. - */ - -#include -#include -#include -#include -#include -#include - -/* - * If lockdep is enabled then we use the non-preemption spin-ops - * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are - * not re-enabled during lock-acquire (which the preempt-spin-ops do): - */ -#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) -/* - * The __lock_function inlines are taken from - * include/linux/spinlock_api_smp.h - */ -#else -#define raw_read_can_lock(l) read_can_lock(l) -#define raw_write_can_lock(l) write_can_lock(l) - -/* - * Some architectures can relax in favour of the CPU owning the lock. - */ -#ifndef arch_read_relax -# define arch_read_relax(l) cpu_relax() -#endif -#ifndef arch_write_relax -# define arch_write_relax(l) cpu_relax() -#endif -#ifndef arch_spin_relax -# define arch_spin_relax(l) cpu_relax() -#endif - -/* - * We build the __lock_function inlines here. They are too large for - * inlining all over the place, but here is only one user per function - * which embedds them into the calling _lock_function below. - * - * This could be a long-held lock. We both prepare to spin for a long - * time (making _this_ CPU preemptable if possible), and we also signal - * towards that other CPU that it should break the lock ASAP. - */ -#define BUILD_LOCK_OPS(op, locktype) \ -void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ -{ \ - for (;;) { \ - preempt_disable(); \ - if (likely(do_raw_##op##_trylock(lock))) \ - break; \ - preempt_enable(); \ - \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ - arch_##op##_relax(&lock->raw_lock); \ - } \ - (lock)->break_lock = 0; \ -} \ - \ -unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ -{ \ - unsigned long flags; \ - \ - for (;;) { \ - preempt_disable(); \ - local_irq_save(flags); \ - if (likely(do_raw_##op##_trylock(lock))) \ - break; \ - local_irq_restore(flags); \ - preempt_enable(); \ - \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ - arch_##op##_relax(&lock->raw_lock); \ - } \ - (lock)->break_lock = 0; \ - return flags; \ -} \ - \ -void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ -{ \ - _raw_##op##_lock_irqsave(lock); \ -} \ - \ -void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ -{ \ - unsigned long flags; \ - \ - /* */ \ - /* Careful: we must exclude softirqs too, hence the */ \ - /* irq-disabling. We use the generic preemption-aware */ \ - /* function: */ \ - /**/ \ - flags = _raw_##op##_lock_irqsave(lock); \ - local_bh_disable(); \ - local_irq_restore(flags); \ -} \ - -/* - * Build preemption-friendly versions of the following - * lock-spinning functions: - * - * __[spin|read|write]_lock() - * __[spin|read|write]_lock_irq() - * __[spin|read|write]_lock_irqsave() - * __[spin|read|write]_lock_bh() - */ -BUILD_LOCK_OPS(spin, raw_spinlock); -BUILD_LOCK_OPS(read, rwlock); -BUILD_LOCK_OPS(write, rwlock); - -#endif - -#ifndef CONFIG_INLINE_SPIN_TRYLOCK -int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) -{ - return __raw_spin_trylock(lock); -} -EXPORT_SYMBOL(_raw_spin_trylock); -#endif - -#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) -{ - return __raw_spin_trylock_bh(lock); -} -EXPORT_SYMBOL(_raw_spin_trylock_bh); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) -{ - __raw_spin_lock(lock); -} -EXPORT_SYMBOL(_raw_spin_lock); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE -unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) -{ - return __raw_spin_lock_irqsave(lock); -} -EXPORT_SYMBOL(_raw_spin_lock_irqsave); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ -void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) -{ - __raw_spin_lock_irq(lock); -} -EXPORT_SYMBOL(_raw_spin_lock_irq); -#endif - -#ifndef CONFIG_INLINE_SPIN_LOCK_BH -void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) -{ - __raw_spin_lock_bh(lock); -} -EXPORT_SYMBOL(_raw_spin_lock_bh); -#endif - -#ifdef CONFIG_UNINLINE_SPIN_UNLOCK -void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) -{ - __raw_spin_unlock(lock); -} -EXPORT_SYMBOL(_raw_spin_unlock); -#endif - -#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) -{ - __raw_spin_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); -#endif - -#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) -{ - __raw_spin_unlock_irq(lock); -} -EXPORT_SYMBOL(_raw_spin_unlock_irq); -#endif - -#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) -{ - __raw_spin_unlock_bh(lock); -} -EXPORT_SYMBOL(_raw_spin_unlock_bh); -#endif - -#ifndef CONFIG_INLINE_READ_TRYLOCK -int __lockfunc _raw_read_trylock(rwlock_t *lock) -{ - return __raw_read_trylock(lock); -} -EXPORT_SYMBOL(_raw_read_trylock); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK -void __lockfunc _raw_read_lock(rwlock_t *lock) -{ - __raw_read_lock(lock); -} -EXPORT_SYMBOL(_raw_read_lock); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE -unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) -{ - return __raw_read_lock_irqsave(lock); -} -EXPORT_SYMBOL(_raw_read_lock_irqsave); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK_IRQ -void __lockfunc _raw_read_lock_irq(rwlock_t *lock) -{ - __raw_read_lock_irq(lock); -} -EXPORT_SYMBOL(_raw_read_lock_irq); -#endif - -#ifndef CONFIG_INLINE_READ_LOCK_BH -void __lockfunc _raw_read_lock_bh(rwlock_t *lock) -{ - __raw_read_lock_bh(lock); -} -EXPORT_SYMBOL(_raw_read_lock_bh); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK -void __lockfunc _raw_read_unlock(rwlock_t *lock) -{ - __raw_read_unlock(lock); -} -EXPORT_SYMBOL(_raw_read_unlock); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE -void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) -{ - __raw_read_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(_raw_read_unlock_irqrestore); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ -void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) -{ - __raw_read_unlock_irq(lock); -} -EXPORT_SYMBOL(_raw_read_unlock_irq); -#endif - -#ifndef CONFIG_INLINE_READ_UNLOCK_BH -void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) -{ - __raw_read_unlock_bh(lock); -} -EXPORT_SYMBOL(_raw_read_unlock_bh); -#endif - -#ifndef CONFIG_INLINE_WRITE_TRYLOCK -int __lockfunc _raw_write_trylock(rwlock_t *lock) -{ - return __raw_write_trylock(lock); -} -EXPORT_SYMBOL(_raw_write_trylock); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK -void __lockfunc _raw_write_lock(rwlock_t *lock) -{ - __raw_write_lock(lock); -} -EXPORT_SYMBOL(_raw_write_lock); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE -unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) -{ - return __raw_write_lock_irqsave(lock); -} -EXPORT_SYMBOL(_raw_write_lock_irqsave); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ -void __lockfunc _raw_write_lock_irq(rwlock_t *lock) -{ - __raw_write_lock_irq(lock); -} -EXPORT_SYMBOL(_raw_write_lock_irq); -#endif - -#ifndef CONFIG_INLINE_WRITE_LOCK_BH -void __lockfunc _raw_write_lock_bh(rwlock_t *lock) -{ - __raw_write_lock_bh(lock); -} -EXPORT_SYMBOL(_raw_write_lock_bh); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK -void __lockfunc _raw_write_unlock(rwlock_t *lock) -{ - __raw_write_unlock(lock); -} -EXPORT_SYMBOL(_raw_write_unlock); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE -void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) -{ - __raw_write_unlock_irqrestore(lock, flags); -} -EXPORT_SYMBOL(_raw_write_unlock_irqrestore); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ -void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) -{ - __raw_write_unlock_irq(lock); -} -EXPORT_SYMBOL(_raw_write_unlock_irq); -#endif - -#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH -void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) -{ - __raw_write_unlock_bh(lock); -} -EXPORT_SYMBOL(_raw_write_unlock_bh); -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) -{ - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -} -EXPORT_SYMBOL(_raw_spin_lock_nested); - -unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, - int subclass) -{ - unsigned long flags; - - local_irq_save(flags); - preempt_disable(); - spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, - do_raw_spin_lock_flags, &flags); - return flags; -} -EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); - -void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, - struct lockdep_map *nest_lock) -{ - preempt_disable(); - spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); - LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); -} -EXPORT_SYMBOL(_raw_spin_lock_nest_lock); - -#endif - -notrace int in_lock_functions(unsigned long addr) -{ - /* Linker adds these: start and end of __lockfunc functions */ - extern char __lock_text_start[], __lock_text_end[]; - - return addr >= (unsigned long)__lock_text_start - && addr < (unsigned long)__lock_text_end; -} -EXPORT_SYMBOL(in_lock_functions); diff --git a/lib/Makefile b/lib/Makefile index f3bb2cb98adf..bee27e1d3431 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o diff --git a/lib/spinlock_debug.c b/lib/spinlock_debug.c deleted file mode 100644 index 0374a596cffa..000000000000 --- a/lib/spinlock_debug.c +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright 2005, Red Hat, Inc., Ingo Molnar - * Released under the General Public License (GPL). - * - * This file contains the spinlock/rwlock implementations for - * DEBUG_SPINLOCK. - */ - -#include -#include -#include -#include -#include -#include - -void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - lock->magic = SPINLOCK_MAGIC; - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -EXPORT_SYMBOL(__raw_spin_lock_init); - -void __rwlock_init(rwlock_t *lock, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif - lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; - lock->magic = RWLOCK_MAGIC; - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -EXPORT_SYMBOL(__rwlock_init); - -static void spin_dump(raw_spinlock_t *lock, const char *msg) -{ - struct task_struct *owner = NULL; - - if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) - owner = lock->owner; - printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", - msg, raw_smp_processor_id(), - current->comm, task_pid_nr(current)); - printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " - ".owner_cpu: %d\n", - lock, lock->magic, - owner ? owner->comm : "", - owner ? task_pid_nr(owner) : -1, - lock->owner_cpu); - dump_stack(); -} - -static void spin_bug(raw_spinlock_t *lock, const char *msg) -{ - if (!debug_locks_off()) - return; - - spin_dump(lock, msg); -} - -#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) - -static inline void -debug_spin_lock_before(raw_spinlock_t *lock) -{ - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(lock->owner == current, lock, "recursion"); - SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), - lock, "cpu recursion"); -} - -static inline void debug_spin_lock_after(raw_spinlock_t *lock) -{ - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; -} - -static inline void debug_spin_unlock(raw_spinlock_t *lock) -{ - SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); - SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked"); - SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); - SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), - lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -static void __spin_lock_debug(raw_spinlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - - for (i = 0; i < loops; i++) { - if (arch_spin_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - spin_dump(lock, "lockup suspected"); -#ifdef CONFIG_SMP - trigger_all_cpu_backtrace(); -#endif - - /* - * The trylock above was causing a livelock. Give the lower level arch - * specific lock code a chance to acquire the lock. We have already - * printed a warning/backtrace at this point. The non-debug arch - * specific code might actually succeed in acquiring the lock. If it is - * not successful, the end-result is the same - there is no forward - * progress. - */ - arch_spin_lock(&lock->raw_lock); -} - -void do_raw_spin_lock(raw_spinlock_t *lock) -{ - debug_spin_lock_before(lock); - if (unlikely(!arch_spin_trylock(&lock->raw_lock))) - __spin_lock_debug(lock); - debug_spin_lock_after(lock); -} - -int do_raw_spin_trylock(raw_spinlock_t *lock) -{ - int ret = arch_spin_trylock(&lock->raw_lock); - - if (ret) - debug_spin_lock_after(lock); -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - SPIN_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_spin_unlock(raw_spinlock_t *lock) -{ - debug_spin_unlock(lock); - arch_spin_unlock(&lock->raw_lock); -} - -static void rwlock_bug(rwlock_t *lock, const char *msg) -{ - if (!debug_locks_off()) - return; - - printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", - msg, raw_smp_processor_id(), current->comm, - task_pid_nr(current), lock); - dump_stack(); -} - -#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) - -#if 0 /* __write_lock_debug() can lock up - maybe this can too? */ -static void __read_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_read_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - -void do_raw_read_lock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - arch_read_lock(&lock->raw_lock); -} - -int do_raw_read_trylock(rwlock_t *lock) -{ - int ret = arch_read_trylock(&lock->raw_lock); - -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_read_unlock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - arch_read_unlock(&lock->raw_lock); -} - -static inline void debug_write_lock_before(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); - RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), - lock, "cpu recursion"); -} - -static inline void debug_write_lock_after(rwlock_t *lock) -{ - lock->owner_cpu = raw_smp_processor_id(); - lock->owner = current; -} - -static inline void debug_write_unlock(rwlock_t *lock) -{ - RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); - RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), - lock, "wrong CPU"); - lock->owner = SPINLOCK_OWNER_INIT; - lock->owner_cpu = -1; -} - -#if 0 /* This can cause lockups */ -static void __write_lock_debug(rwlock_t *lock) -{ - u64 i; - u64 loops = loops_per_jiffy * HZ; - int print_once = 1; - - for (;;) { - for (i = 0; i < loops; i++) { - if (arch_write_trylock(&lock->raw_lock)) - return; - __delay(1); - } - /* lockup suspected: */ - if (print_once) { - print_once = 0; - printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " - "%s/%d, %p\n", - raw_smp_processor_id(), current->comm, - current->pid, lock); - dump_stack(); - } - } -} -#endif - -void do_raw_write_lock(rwlock_t *lock) -{ - debug_write_lock_before(lock); - arch_write_lock(&lock->raw_lock); - debug_write_lock_after(lock); -} - -int do_raw_write_trylock(rwlock_t *lock) -{ - int ret = arch_write_trylock(&lock->raw_lock); - - if (ret) - debug_write_lock_after(lock); -#ifndef CONFIG_SMP - /* - * Must not happen on UP: - */ - RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); -#endif - return ret; -} - -void do_raw_write_unlock(rwlock_t *lock) -{ - debug_write_unlock(lock); - arch_write_unlock(&lock->raw_lock); -} -- cgit v1.3 From e25a64c4017e3a3cda17454b040737e410a12991 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 18:16:43 +0100 Subject: locking: Move the semaphore core to kernel/locking/ Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-vmw5sf6vzmua1z6nx1cg69h2@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 +- kernel/locking/Makefile | 2 +- kernel/locking/semaphore.c | 263 +++++++++++++++++++++++++++++++++++++++++++++ kernel/semaphore.c | 263 --------------------------------------------- 4 files changed, 265 insertions(+), 265 deletions(-) create mode 100644 kernel/locking/semaphore.c delete mode 100644 kernel/semaphore.c (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 4bce165dce5d..45e5ae26dc03 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ extable.o params.o posix-timers.o \ kthread.o sys_ni.o posix-cpu-timers.o \ - hrtimer.o rwsem.o nsproxy.o semaphore.o \ + hrtimer.o rwsem.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 674d2152d10f..5978fddf1412 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o +obj-y += mutex.o semaphore.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c new file mode 100644 index 000000000000..6815171a4fff --- /dev/null +++ b/kernel/locking/semaphore.c @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2008 Intel Corporation + * Author: Matthew Wilcox + * + * Distributed under the terms of the GNU GPL, version 2 + * + * This file implements counting semaphores. + * A counting semaphore may be acquired 'n' times before sleeping. + * See mutex.c for single-acquisition sleeping locks which enforce + * rules which allow code to be debugged more easily. + */ + +/* + * Some notes on the implementation: + * + * The spinlock controls access to the other members of the semaphore. + * down_trylock() and up() can be called from interrupt context, so we + * have to disable interrupts when taking the lock. It turns out various + * parts of the kernel expect to be able to use down() on a semaphore in + * interrupt context when they know it will succeed, so we have to use + * irqsave variants for down(), down_interruptible() and down_killable() + * too. + * + * The ->count variable represents how many more tasks can acquire this + * semaphore. If it's zero, there may be tasks waiting on the wait_list. + */ + +#include +#include +#include +#include +#include +#include +#include + +static noinline void __down(struct semaphore *sem); +static noinline int __down_interruptible(struct semaphore *sem); +static noinline int __down_killable(struct semaphore *sem); +static noinline int __down_timeout(struct semaphore *sem, long jiffies); +static noinline void __up(struct semaphore *sem); + +/** + * down - acquire the semaphore + * @sem: the semaphore to be acquired + * + * Acquires the semaphore. If no more tasks are allowed to acquire the + * semaphore, calling this function will put the task to sleep until the + * semaphore is released. + * + * Use of this function is deprecated, please use down_interruptible() or + * down_killable() instead. + */ +void down(struct semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + __down(sem); + raw_spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(down); + +/** + * down_interruptible - acquire the semaphore unless interrupted + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a signal, this function will return -EINTR. + * If the semaphore is successfully acquired, this function returns 0. + */ +int down_interruptible(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + raw_spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_interruptible(sem); + raw_spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_interruptible); + +/** + * down_killable - acquire the semaphore unless killed + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a fatal signal, this function will return + * -EINTR. If the semaphore is successfully acquired, this function returns + * 0. + */ +int down_killable(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + raw_spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_killable(sem); + raw_spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_killable); + +/** + * down_trylock - try to acquire the semaphore, without waiting + * @sem: the semaphore to be acquired + * + * Try to acquire the semaphore atomically. Returns 0 if the semaphore has + * been acquired successfully or 1 if it it cannot be acquired. + * + * NOTE: This return value is inverted from both spin_trylock and + * mutex_trylock! Be careful about this when converting code. + * + * Unlike mutex_trylock, this function can be used from interrupt context, + * and the semaphore can be released by any task or interrupt. + */ +int down_trylock(struct semaphore *sem) +{ + unsigned long flags; + int count; + + raw_spin_lock_irqsave(&sem->lock, flags); + count = sem->count - 1; + if (likely(count >= 0)) + sem->count = count; + raw_spin_unlock_irqrestore(&sem->lock, flags); + + return (count < 0); +} +EXPORT_SYMBOL(down_trylock); + +/** + * down_timeout - acquire the semaphore within a specified time + * @sem: the semaphore to be acquired + * @jiffies: how long to wait before failing + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the semaphore is not released within the specified number of jiffies, + * this function returns -ETIME. It returns 0 if the semaphore was acquired. + */ +int down_timeout(struct semaphore *sem, long jiffies) +{ + unsigned long flags; + int result = 0; + + raw_spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_timeout(sem, jiffies); + raw_spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_timeout); + +/** + * up - release the semaphore + * @sem: the semaphore to release + * + * Release the semaphore. Unlike mutexes, up() may be called from any + * context and even by tasks which have never called down(). + */ +void up(struct semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->lock, flags); + if (likely(list_empty(&sem->wait_list))) + sem->count++; + else + __up(sem); + raw_spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(up); + +/* Functions for the contended case */ + +struct semaphore_waiter { + struct list_head list; + struct task_struct *task; + bool up; +}; + +/* + * Because this function is inlined, the 'state' parameter will be + * constant, and thus optimised away by the compiler. Likewise the + * 'timeout' parameter for the cases without timeouts. + */ +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) +{ + struct task_struct *task = current; + struct semaphore_waiter waiter; + + list_add_tail(&waiter.list, &sem->wait_list); + waiter.task = task; + waiter.up = false; + + for (;;) { + if (signal_pending_state(state, task)) + goto interrupted; + if (unlikely(timeout <= 0)) + goto timed_out; + __set_task_state(task, state); + raw_spin_unlock_irq(&sem->lock); + timeout = schedule_timeout(timeout); + raw_spin_lock_irq(&sem->lock); + if (waiter.up) + return 0; + } + + timed_out: + list_del(&waiter.list); + return -ETIME; + + interrupted: + list_del(&waiter.list); + return -EINTR; +} + +static noinline void __sched __down(struct semaphore *sem) +{ + __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_interruptible(struct semaphore *sem) +{ + return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_killable(struct semaphore *sem) +{ + return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) +{ + return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); +} + +static noinline void __sched __up(struct semaphore *sem) +{ + struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, + struct semaphore_waiter, list); + list_del(&waiter->list); + waiter->up = true; + wake_up_process(waiter->task); +} diff --git a/kernel/semaphore.c b/kernel/semaphore.c deleted file mode 100644 index 6815171a4fff..000000000000 --- a/kernel/semaphore.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2008 Intel Corporation - * Author: Matthew Wilcox - * - * Distributed under the terms of the GNU GPL, version 2 - * - * This file implements counting semaphores. - * A counting semaphore may be acquired 'n' times before sleeping. - * See mutex.c for single-acquisition sleeping locks which enforce - * rules which allow code to be debugged more easily. - */ - -/* - * Some notes on the implementation: - * - * The spinlock controls access to the other members of the semaphore. - * down_trylock() and up() can be called from interrupt context, so we - * have to disable interrupts when taking the lock. It turns out various - * parts of the kernel expect to be able to use down() on a semaphore in - * interrupt context when they know it will succeed, so we have to use - * irqsave variants for down(), down_interruptible() and down_killable() - * too. - * - * The ->count variable represents how many more tasks can acquire this - * semaphore. If it's zero, there may be tasks waiting on the wait_list. - */ - -#include -#include -#include -#include -#include -#include -#include - -static noinline void __down(struct semaphore *sem); -static noinline int __down_interruptible(struct semaphore *sem); -static noinline int __down_killable(struct semaphore *sem); -static noinline int __down_timeout(struct semaphore *sem, long jiffies); -static noinline void __up(struct semaphore *sem); - -/** - * down - acquire the semaphore - * @sem: the semaphore to be acquired - * - * Acquires the semaphore. If no more tasks are allowed to acquire the - * semaphore, calling this function will put the task to sleep until the - * semaphore is released. - * - * Use of this function is deprecated, please use down_interruptible() or - * down_killable() instead. - */ -void down(struct semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - __down(sem); - raw_spin_unlock_irqrestore(&sem->lock, flags); -} -EXPORT_SYMBOL(down); - -/** - * down_interruptible - acquire the semaphore unless interrupted - * @sem: the semaphore to be acquired - * - * Attempts to acquire the semaphore. If no more tasks are allowed to - * acquire the semaphore, calling this function will put the task to sleep. - * If the sleep is interrupted by a signal, this function will return -EINTR. - * If the semaphore is successfully acquired, this function returns 0. - */ -int down_interruptible(struct semaphore *sem) -{ - unsigned long flags; - int result = 0; - - raw_spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - result = __down_interruptible(sem); - raw_spin_unlock_irqrestore(&sem->lock, flags); - - return result; -} -EXPORT_SYMBOL(down_interruptible); - -/** - * down_killable - acquire the semaphore unless killed - * @sem: the semaphore to be acquired - * - * Attempts to acquire the semaphore. If no more tasks are allowed to - * acquire the semaphore, calling this function will put the task to sleep. - * If the sleep is interrupted by a fatal signal, this function will return - * -EINTR. If the semaphore is successfully acquired, this function returns - * 0. - */ -int down_killable(struct semaphore *sem) -{ - unsigned long flags; - int result = 0; - - raw_spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - result = __down_killable(sem); - raw_spin_unlock_irqrestore(&sem->lock, flags); - - return result; -} -EXPORT_SYMBOL(down_killable); - -/** - * down_trylock - try to acquire the semaphore, without waiting - * @sem: the semaphore to be acquired - * - * Try to acquire the semaphore atomically. Returns 0 if the semaphore has - * been acquired successfully or 1 if it it cannot be acquired. - * - * NOTE: This return value is inverted from both spin_trylock and - * mutex_trylock! Be careful about this when converting code. - * - * Unlike mutex_trylock, this function can be used from interrupt context, - * and the semaphore can be released by any task or interrupt. - */ -int down_trylock(struct semaphore *sem) -{ - unsigned long flags; - int count; - - raw_spin_lock_irqsave(&sem->lock, flags); - count = sem->count - 1; - if (likely(count >= 0)) - sem->count = count; - raw_spin_unlock_irqrestore(&sem->lock, flags); - - return (count < 0); -} -EXPORT_SYMBOL(down_trylock); - -/** - * down_timeout - acquire the semaphore within a specified time - * @sem: the semaphore to be acquired - * @jiffies: how long to wait before failing - * - * Attempts to acquire the semaphore. If no more tasks are allowed to - * acquire the semaphore, calling this function will put the task to sleep. - * If the semaphore is not released within the specified number of jiffies, - * this function returns -ETIME. It returns 0 if the semaphore was acquired. - */ -int down_timeout(struct semaphore *sem, long jiffies) -{ - unsigned long flags; - int result = 0; - - raw_spin_lock_irqsave(&sem->lock, flags); - if (likely(sem->count > 0)) - sem->count--; - else - result = __down_timeout(sem, jiffies); - raw_spin_unlock_irqrestore(&sem->lock, flags); - - return result; -} -EXPORT_SYMBOL(down_timeout); - -/** - * up - release the semaphore - * @sem: the semaphore to release - * - * Release the semaphore. Unlike mutexes, up() may be called from any - * context and even by tasks which have never called down(). - */ -void up(struct semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->lock, flags); - if (likely(list_empty(&sem->wait_list))) - sem->count++; - else - __up(sem); - raw_spin_unlock_irqrestore(&sem->lock, flags); -} -EXPORT_SYMBOL(up); - -/* Functions for the contended case */ - -struct semaphore_waiter { - struct list_head list; - struct task_struct *task; - bool up; -}; - -/* - * Because this function is inlined, the 'state' parameter will be - * constant, and thus optimised away by the compiler. Likewise the - * 'timeout' parameter for the cases without timeouts. - */ -static inline int __sched __down_common(struct semaphore *sem, long state, - long timeout) -{ - struct task_struct *task = current; - struct semaphore_waiter waiter; - - list_add_tail(&waiter.list, &sem->wait_list); - waiter.task = task; - waiter.up = false; - - for (;;) { - if (signal_pending_state(state, task)) - goto interrupted; - if (unlikely(timeout <= 0)) - goto timed_out; - __set_task_state(task, state); - raw_spin_unlock_irq(&sem->lock); - timeout = schedule_timeout(timeout); - raw_spin_lock_irq(&sem->lock); - if (waiter.up) - return 0; - } - - timed_out: - list_del(&waiter.list); - return -ETIME; - - interrupted: - list_del(&waiter.list); - return -EINTR; -} - -static noinline void __sched __down(struct semaphore *sem) -{ - __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} - -static noinline int __sched __down_interruptible(struct semaphore *sem) -{ - return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} - -static noinline int __sched __down_killable(struct semaphore *sem) -{ - return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); -} - -static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) -{ - return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); -} - -static noinline void __sched __up(struct semaphore *sem) -{ - struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, - struct semaphore_waiter, list); - list_del(&waiter->list); - waiter->up = true; - wake_up_process(waiter->task); -} -- cgit v1.3 From 1696a8bee390929fed05c6297164816ae2ced280 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 18:18:19 +0100 Subject: locking: Move the rtmutex code to kernel/locking/ Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-p9ijt8div0hwldexwfm4nlhj@git.kernel.org [ Fixed build failure in kernel/rcu/tree_plugin.h. ] Signed-off-by: Ingo Molnar --- kernel/Makefile | 3 - kernel/futex.c | 2 +- kernel/locking/Makefile | 3 + kernel/locking/rtmutex-debug.c | 187 +++++++ kernel/locking/rtmutex-debug.h | 33 ++ kernel/locking/rtmutex-tester.c | 420 ++++++++++++++++ kernel/locking/rtmutex.c | 1060 +++++++++++++++++++++++++++++++++++++++ kernel/locking/rtmutex.h | 26 + kernel/locking/rtmutex_common.h | 126 +++++ kernel/rcu/tree_plugin.h | 2 +- kernel/rtmutex-debug.c | 187 ------- kernel/rtmutex-debug.h | 33 -- kernel/rtmutex-tester.c | 420 ---------------- kernel/rtmutex.c | 1060 --------------------------------------- kernel/rtmutex.h | 26 - kernel/rtmutex_common.h | 126 ----- 16 files changed, 1857 insertions(+), 1857 deletions(-) create mode 100644 kernel/locking/rtmutex-debug.c create mode 100644 kernel/locking/rtmutex-debug.h create mode 100644 kernel/locking/rtmutex-tester.c create mode 100644 kernel/locking/rtmutex.c create mode 100644 kernel/locking/rtmutex.h create mode 100644 kernel/locking/rtmutex_common.h delete mode 100644 kernel/rtmutex-debug.c delete mode 100644 kernel/rtmutex-debug.h delete mode 100644 kernel/rtmutex-tester.c delete mode 100644 kernel/rtmutex.c delete mode 100644 kernel/rtmutex.h delete mode 100644 kernel/rtmutex_common.h (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 45e5ae26dc03..9c2ad1852223 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -35,9 +35,6 @@ obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) obj-$(CONFIG_FUTEX) += futex_compat.o endif -obj-$(CONFIG_RT_MUTEXES) += rtmutex.o -obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) diff --git a/kernel/futex.c b/kernel/futex.c index c3a1a55a5214..80ba086f021d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -66,7 +66,7 @@ #include -#include "rtmutex_common.h" +#include "locking/rtmutex_common.h" int __read_mostly futex_cmpxchg_enabled; diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 5978fddf1412..59f66dec2bf9 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -15,5 +15,8 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o +obj-$(CONFIG_RT_MUTEXES) += rtmutex.o +obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o +obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c new file mode 100644 index 000000000000..13b243a323fa --- /dev/null +++ b/kernel/locking/rtmutex-debug.c @@ -0,0 +1,187 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner + * + * This code is based on the rt.c implementation in the preempt-rt tree. + * Portions of said code are + * + * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Copyright (C) 2006 Esben Nielsen + * Copyright (C) 2006 Kihon Technologies Inc., + * Steven Rostedt + * + * See rt.c in preempt-rt for proper credits and further information + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtmutex_common.h" + +static void printk_task(struct task_struct *p) +{ + if (p) + printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); + else + printk(""); +} + +static void printk_lock(struct rt_mutex *lock, int print_owner) +{ + if (lock->name) + printk(" [%p] {%s}\n", + lock, lock->name); + else + printk(" [%p] {%s:%d}\n", + lock, lock->file, lock->line); + + if (print_owner && rt_mutex_owner(lock)) { + printk(".. ->owner: %p\n", lock->owner); + printk(".. held by: "); + printk_task(rt_mutex_owner(lock)); + printk("\n"); + } +} + +void rt_mutex_debug_task_free(struct task_struct *task) +{ + DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); +} + +/* + * We fill out the fields in the waiter to store the information about + * the deadlock. We print when we return. act_waiter can be NULL in + * case of a remove waiter operation. + */ +void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, + struct rt_mutex *lock) +{ + struct task_struct *task; + + if (!debug_locks || detect || !act_waiter) + return; + + task = rt_mutex_owner(act_waiter->lock); + if (task && task != current) { + act_waiter->deadlock_task_pid = get_pid(task_pid(task)); + act_waiter->deadlock_lock = lock; + } +} + +void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) +{ + struct task_struct *task; + + if (!waiter->deadlock_lock || !debug_locks) + return; + + rcu_read_lock(); + task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); + if (!task) { + rcu_read_unlock(); + return; + } + + if (!debug_locks_off()) { + rcu_read_unlock(); + return; + } + + printk("\n============================================\n"); + printk( "[ BUG: circular locking deadlock detected! ]\n"); + printk("%s\n", print_tainted()); + printk( "--------------------------------------------\n"); + printk("%s/%d is deadlocking current task %s/%d\n\n", + task->comm, task_pid_nr(task), + current->comm, task_pid_nr(current)); + + printk("\n1) %s/%d is trying to acquire this lock:\n", + current->comm, task_pid_nr(current)); + printk_lock(waiter->lock, 1); + + printk("\n2) %s/%d is blocked on this lock:\n", + task->comm, task_pid_nr(task)); + printk_lock(waiter->deadlock_lock, 1); + + debug_show_held_locks(current); + debug_show_held_locks(task); + + printk("\n%s/%d's [blocked] stackdump:\n\n", + task->comm, task_pid_nr(task)); + show_stack(task, NULL); + printk("\n%s/%d's [current] stackdump:\n\n", + current->comm, task_pid_nr(current)); + dump_stack(); + debug_show_all_locks(); + rcu_read_unlock(); + + printk("[ turning off deadlock detection." + "Please report this trace. ]\n\n"); +} + +void debug_rt_mutex_lock(struct rt_mutex *lock) +{ +} + +void debug_rt_mutex_unlock(struct rt_mutex *lock) +{ + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); +} + +void +debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) +{ +} + +void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) +{ + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); +} + +void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + memset(waiter, 0x11, sizeof(*waiter)); + plist_node_init(&waiter->list_entry, MAX_PRIO); + plist_node_init(&waiter->pi_list_entry, MAX_PRIO); + waiter->deadlock_task_pid = NULL; +} + +void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) +{ + put_pid(waiter->deadlock_task_pid); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + memset(waiter, 0x22, sizeof(*waiter)); +} + +void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) +{ + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lock->name = name; +} + +void +rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) +{ +} + +void rt_mutex_deadlock_account_unlock(struct task_struct *task) +{ +} + diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h new file mode 100644 index 000000000000..14193d596d78 --- /dev/null +++ b/kernel/locking/rtmutex-debug.h @@ -0,0 +1,33 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains macros used solely by rtmutex.c. Debug version. + */ + +extern void +rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); +extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); +extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); +extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); +extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void debug_rt_mutex_lock(struct rt_mutex *lock); +extern void debug_rt_mutex_unlock(struct rt_mutex *lock); +extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner); +extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); +extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, + struct rt_mutex *lock); +extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); +# define debug_rt_mutex_reset_waiter(w) \ + do { (w)->deadlock_lock = NULL; } while (0) + +static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, + int detect) +{ + return (waiter != NULL); +} diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c new file mode 100644 index 000000000000..1d96dd0d93c1 --- /dev/null +++ b/kernel/locking/rtmutex-tester.c @@ -0,0 +1,420 @@ +/* + * RT-Mutex-tester: scriptable tester for rt mutexes + * + * started by Thomas Gleixner: + * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rtmutex.h" + +#define MAX_RT_TEST_THREADS 8 +#define MAX_RT_TEST_MUTEXES 8 + +static spinlock_t rttest_lock; +static atomic_t rttest_event; + +struct test_thread_data { + int opcode; + int opdata; + int mutexes[MAX_RT_TEST_MUTEXES]; + int event; + struct device dev; +}; + +static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; +static struct task_struct *threads[MAX_RT_TEST_THREADS]; +static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; + +enum test_opcodes { + RTTEST_NOP = 0, + RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ + RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ + RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ + RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ + RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ + RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ + RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ + RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ + /* 9, 10 - reserved for BKL commemoration */ + RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ + RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ + RTTEST_RESET = 99, /* 99 Reset all pending operations */ +}; + +static int handle_op(struct test_thread_data *td, int lockwakeup) +{ + int i, id, ret = -EINVAL; + + switch(td->opcode) { + + case RTTEST_NOP: + return 0; + + case RTTEST_LOCKCONT: + td->mutexes[td->opdata] = 1; + td->event = atomic_add_return(1, &rttest_event); + return 0; + + case RTTEST_RESET: + for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { + if (td->mutexes[i] == 4) { + rt_mutex_unlock(&mutexes[i]); + td->mutexes[i] = 0; + } + } + return 0; + + case RTTEST_RESETEVENT: + atomic_set(&rttest_event, 0); + return 0; + + default: + if (lockwakeup) + return ret; + } + + switch(td->opcode) { + + case RTTEST_LOCK: + case RTTEST_LOCKNOWAIT: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES) + return ret; + + td->mutexes[id] = 1; + td->event = atomic_add_return(1, &rttest_event); + rt_mutex_lock(&mutexes[id]); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = 4; + return 0; + + case RTTEST_LOCKINT: + case RTTEST_LOCKINTNOWAIT: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES) + return ret; + + td->mutexes[id] = 1; + td->event = atomic_add_return(1, &rttest_event); + ret = rt_mutex_lock_interruptible(&mutexes[id], 0); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = ret ? 0 : 4; + return ret ? -EINTR : 0; + + case RTTEST_UNLOCK: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) + return ret; + + td->event = atomic_add_return(1, &rttest_event); + rt_mutex_unlock(&mutexes[id]); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = 0; + return 0; + + default: + break; + } + return ret; +} + +/* + * Schedule replacement for rtsem_down(). Only called for threads with + * PF_MUTEX_TESTER set. + * + * This allows us to have finegrained control over the event flow. + * + */ +void schedule_rt_mutex_test(struct rt_mutex *mutex) +{ + int tid, op, dat; + struct test_thread_data *td; + + /* We have to lookup the task */ + for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { + if (threads[tid] == current) + break; + } + + BUG_ON(tid == MAX_RT_TEST_THREADS); + + td = &thread_data[tid]; + + op = td->opcode; + dat = td->opdata; + + switch (op) { + case RTTEST_LOCK: + case RTTEST_LOCKINT: + case RTTEST_LOCKNOWAIT: + case RTTEST_LOCKINTNOWAIT: + if (mutex != &mutexes[dat]) + break; + + if (td->mutexes[dat] != 1) + break; + + td->mutexes[dat] = 2; + td->event = atomic_add_return(1, &rttest_event); + break; + + default: + break; + } + + schedule(); + + + switch (op) { + case RTTEST_LOCK: + case RTTEST_LOCKINT: + if (mutex != &mutexes[dat]) + return; + + if (td->mutexes[dat] != 2) + return; + + td->mutexes[dat] = 3; + td->event = atomic_add_return(1, &rttest_event); + break; + + case RTTEST_LOCKNOWAIT: + case RTTEST_LOCKINTNOWAIT: + if (mutex != &mutexes[dat]) + return; + + if (td->mutexes[dat] != 2) + return; + + td->mutexes[dat] = 1; + td->event = atomic_add_return(1, &rttest_event); + return; + + default: + return; + } + + td->opcode = 0; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (td->opcode > 0) { + int ret; + + set_current_state(TASK_RUNNING); + ret = handle_op(td, 1); + set_current_state(TASK_INTERRUPTIBLE); + if (td->opcode == RTTEST_LOCKCONT) + break; + td->opcode = ret; + } + + /* Wait for the next command to be executed */ + schedule(); + } + + /* Restore previous command and data */ + td->opcode = op; + td->opdata = dat; +} + +static int test_func(void *data) +{ + struct test_thread_data *td = data; + int ret; + + current->flags |= PF_MUTEX_TESTER; + set_freezable(); + allow_signal(SIGHUP); + + for(;;) { + + set_current_state(TASK_INTERRUPTIBLE); + + if (td->opcode > 0) { + set_current_state(TASK_RUNNING); + ret = handle_op(td, 0); + set_current_state(TASK_INTERRUPTIBLE); + td->opcode = ret; + } + + /* Wait for the next command to be executed */ + schedule(); + try_to_freeze(); + + if (signal_pending(current)) + flush_signals(current); + + if(kthread_should_stop()) + break; + } + return 0; +} + +/** + * sysfs_test_command - interface for test commands + * @dev: thread reference + * @buf: command for actual step + * @count: length of buffer + * + * command syntax: + * + * opcode:data + */ +static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct sched_param schedpar; + struct test_thread_data *td; + char cmdbuf[32]; + int op, dat, tid, ret; + + td = container_of(dev, struct test_thread_data, dev); + tid = td->dev.id; + + /* strings from sysfs write are not 0 terminated! */ + if (count >= sizeof(cmdbuf)) + return -EINVAL; + + /* strip of \n: */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + memcpy(cmdbuf, buf, count); + cmdbuf[count] = 0; + + if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) + return -EINVAL; + + switch (op) { + case RTTEST_SCHEDOT: + schedpar.sched_priority = 0; + ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); + if (ret) + return ret; + set_user_nice(current, 0); + break; + + case RTTEST_SCHEDRT: + schedpar.sched_priority = dat; + ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); + if (ret) + return ret; + break; + + case RTTEST_SIGNAL: + send_sig(SIGHUP, threads[tid], 0); + break; + + default: + if (td->opcode > 0) + return -EBUSY; + td->opdata = dat; + td->opcode = op; + wake_up_process(threads[tid]); + } + + return count; +} + +/** + * sysfs_test_status - sysfs interface for rt tester + * @dev: thread to query + * @buf: char buffer to be filled with thread status info + */ +static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct test_thread_data *td; + struct task_struct *tsk; + char *curr = buf; + int i; + + td = container_of(dev, struct test_thread_data, dev); + tsk = threads[td->dev.id]; + + spin_lock(&rttest_lock); + + curr += sprintf(curr, + "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", + td->opcode, td->event, tsk->state, + (MAX_RT_PRIO - 1) - tsk->prio, + (MAX_RT_PRIO - 1) - tsk->normal_prio, + tsk->pi_blocked_on); + + for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) + curr += sprintf(curr, "%d", td->mutexes[i]); + + spin_unlock(&rttest_lock); + + curr += sprintf(curr, ", T: %p, R: %p\n", tsk, + mutexes[td->dev.id].owner); + + return curr - buf; +} + +static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); +static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); + +static struct bus_type rttest_subsys = { + .name = "rttest", + .dev_name = "rttest", +}; + +static int init_test_thread(int id) +{ + thread_data[id].dev.bus = &rttest_subsys; + thread_data[id].dev.id = id; + + threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); + if (IS_ERR(threads[id])) + return PTR_ERR(threads[id]); + + return device_register(&thread_data[id].dev); +} + +static int init_rttest(void) +{ + int ret, i; + + spin_lock_init(&rttest_lock); + + for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) + rt_mutex_init(&mutexes[i]); + + ret = subsys_system_register(&rttest_subsys, NULL); + if (ret) + return ret; + + for (i = 0; i < MAX_RT_TEST_THREADS; i++) { + ret = init_test_thread(i); + if (ret) + break; + ret = device_create_file(&thread_data[i].dev, &dev_attr_status); + if (ret) + break; + ret = device_create_file(&thread_data[i].dev, &dev_attr_command); + if (ret) + break; + } + + printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); + + return ret; +} + +device_initcall(init_rttest); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c new file mode 100644 index 000000000000..0dd6aec1cb6a --- /dev/null +++ b/kernel/locking/rtmutex.c @@ -0,0 +1,1060 @@ +/* + * RT-Mutexes: simple blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner. + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner + * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + * Copyright (C) 2006 Esben Nielsen + * + * See Documentation/rt-mutex-design.txt for details. + */ +#include +#include +#include +#include +#include + +#include "rtmutex_common.h" + +/* + * lock->owner state tracking: + * + * lock->owner holds the task_struct pointer of the owner. Bit 0 + * is used to keep track of the "lock has waiters" state. + * + * owner bit0 + * NULL 0 lock is free (fast acquire possible) + * NULL 1 lock is free and has waiters and the top waiter + * is going to take the lock* + * taskpointer 0 lock is held (fast release possible) + * taskpointer 1 lock is held and has waiters** + * + * The fast atomic compare exchange based acquire and release is only + * possible when bit 0 of lock->owner is 0. + * + * (*) It also can be a transitional state when grabbing the lock + * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, + * we need to set the bit0 before looking at the lock, and the owner may be + * NULL in this small time, hence this can be a transitional state. + * + * (**) There is a small time when bit 0 is set but there are no + * waiters. This can happen when grabbing the lock in the slow path. + * To prevent a cmpxchg of the owner releasing the lock, we need to + * set this bit before looking at the lock. + */ + +static void +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) +{ + unsigned long val = (unsigned long)owner; + + if (rt_mutex_has_waiters(lock)) + val |= RT_MUTEX_HAS_WAITERS; + + lock->owner = (struct task_struct *)val; +} + +static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static void fixup_rt_mutex_waiters(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + clear_rt_mutex_waiters(lock); +} + +/* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + unsigned long owner, *p = (unsigned long *) &lock->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# define rt_mutex_cmpxchg(l,c,n) (0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + +/* + * Calculate task priority from the waiter list priority + * + * Return task->normal_prio when the waiter list is empty or when + * the waiter is not allowed to do priority boosting + */ +int rt_mutex_getprio(struct task_struct *task) +{ + if (likely(!task_has_pi_waiters(task))) + return task->normal_prio; + + return min(task_top_pi_waiter(task)->pi_list_entry.prio, + task->normal_prio); +} + +/* + * Adjust the priority of a task, after its pi_waiters got modified. + * + * This can be both boosting and unboosting. task->pi_lock must be held. + */ +static void __rt_mutex_adjust_prio(struct task_struct *task) +{ + int prio = rt_mutex_getprio(task); + + if (task->prio != prio) + rt_mutex_setprio(task, prio); +} + +/* + * Adjust task priority (undo boosting). Called from the exit path of + * rt_mutex_slowunlock() and rt_mutex_slowlock(). + * + * (Note: We do this outside of the protection of lock->wait_lock to + * allow the lock to be taken while or before we readjust the priority + * of task. We do not use the spin_xx_mutex() variants here as we are + * outside of the debug path.) + */ +static void rt_mutex_adjust_prio(struct task_struct *task) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); +} + +/* + * Max number of times we'll walk the boosting chain: + */ +int max_lock_depth = 1024; + +/* + * Adjust the priority chain. Also used for deadlock detection. + * Decreases task's usage by one - may thus free the task. + * + * @task: the task owning the mutex (owner) for which a chain walk is probably + * needed + * @deadlock_detect: do we have to carry out deadlock detection? + * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck + * things for a task that has just got its priority adjusted, and + * is waiting on a mutex) + * @orig_waiter: rt_mutex_waiter struct for the task that has just donated + * its priority to the mutex owner (can be NULL in the case + * depicted above or if the top waiter is gone away and we are + * actually deboosting the owner) + * @top_task: the current top waiter + * + * Returns 0 or -EDEADLK. + */ +static int rt_mutex_adjust_prio_chain(struct task_struct *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task) +{ + struct rt_mutex *lock; + struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; + int detect_deadlock, ret = 0, depth = 0; + unsigned long flags; + + detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, + deadlock_detect); + + /* + * The (de)boosting is a step by step approach with a lot of + * pitfalls. We want this to be preemptible and we want hold a + * maximum of two locks per step. So we have to check + * carefully whether things change under us. + */ + again: + if (++depth > max_lock_depth) { + static int prev_max; + + /* + * Print this only once. If the admin changes the limit, + * print a new message when reaching the limit again. + */ + if (prev_max != max_lock_depth) { + prev_max = max_lock_depth; + printk(KERN_WARNING "Maximum lock depth %d reached " + "task: %s (%d)\n", max_lock_depth, + top_task->comm, task_pid_nr(top_task)); + } + put_task_struct(task); + + return deadlock_detect ? -EDEADLK : 0; + } + retry: + /* + * Task can not go away as we did a get_task() before ! + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + /* + * Check whether the end of the boosting chain has been + * reached or the state of the chain has changed while we + * dropped the locks. + */ + if (!waiter) + goto out_unlock_pi; + + /* + * Check the orig_waiter state. After we dropped the locks, + * the previous owner of the lock might have released the lock. + */ + if (orig_waiter && !rt_mutex_owner(orig_lock)) + goto out_unlock_pi; + + /* + * Drop out, when the task has no waiters. Note, + * top_waiter can be NULL, when we are in the deboosting + * mode! + */ + if (top_waiter && (!task_has_pi_waiters(task) || + top_waiter != task_top_pi_waiter(task))) + goto out_unlock_pi; + + /* + * When deadlock detection is off then we check, if further + * priority adjustment is necessary. + */ + if (!detect_deadlock && waiter->list_entry.prio == task->prio) + goto out_unlock_pi; + + lock = waiter->lock; + if (!raw_spin_trylock(&lock->wait_lock)) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + cpu_relax(); + goto retry; + } + + /* Deadlock detection */ + if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { + debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); + raw_spin_unlock(&lock->wait_lock); + ret = deadlock_detect ? -EDEADLK : 0; + goto out_unlock_pi; + } + + top_waiter = rt_mutex_top_waiter(lock); + + /* Requeue the waiter */ + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->list_entry.prio = task->prio; + plist_add(&waiter->list_entry, &lock->wait_list); + + /* Release the task */ + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!rt_mutex_owner(lock)) { + /* + * If the requeue above changed the top waiter, then we need + * to wake the new top waiter up to try to get the lock. + */ + + if (top_waiter != rt_mutex_top_waiter(lock)) + wake_up_process(rt_mutex_top_waiter(lock)->task); + raw_spin_unlock(&lock->wait_lock); + goto out_put_task; + } + put_task_struct(task); + + /* Grab the next task */ + task = rt_mutex_owner(lock); + get_task_struct(task); + raw_spin_lock_irqsave(&task->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + /* Boost the owner */ + plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + + } else if (top_waiter == waiter) { + /* Deboost the owner */ + plist_del(&waiter->pi_list_entry, &task->pi_waiters); + waiter = rt_mutex_top_waiter(lock); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + } + + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + top_waiter = rt_mutex_top_waiter(lock); + raw_spin_unlock(&lock->wait_lock); + + if (!detect_deadlock && waiter != top_waiter) + goto out_put_task; + + goto again; + + out_unlock_pi: + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + out_put_task: + put_task_struct(task); + + return ret; +} + +/* + * Try to take an rt-mutex + * + * Must be called with lock->wait_lock held. + * + * @lock: the lock to be acquired. + * @task: the task which wants to acquire the lock + * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) + */ +static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) +{ + /* + * We have to be careful here if the atomic speedups are + * enabled, such that, when + * - no other waiter is on the lock + * - the lock has been released since we did the cmpxchg + * the lock can be released or taken while we are doing the + * checks and marking the lock with RT_MUTEX_HAS_WAITERS. + * + * The atomic acquire/release aware variant of + * mark_rt_mutex_waiters uses a cmpxchg loop. After setting + * the WAITERS bit, the atomic release / acquire can not + * happen anymore and lock->wait_lock protects us from the + * non-atomic case. + * + * Note, that this might set lock->owner = + * RT_MUTEX_HAS_WAITERS in the case the lock is not contended + * any more. This is fixed up when we take the ownership. + * This is the transitional state explained at the top of this file. + */ + mark_rt_mutex_waiters(lock); + + if (rt_mutex_owner(lock)) + return 0; + + /* + * It will get the lock because of one of these conditions: + * 1) there is no waiter + * 2) higher priority than waiters + * 3) it is top waiter + */ + if (rt_mutex_has_waiters(lock)) { + if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (!waiter || waiter != rt_mutex_top_waiter(lock)) + return 0; + } + } + + if (waiter || rt_mutex_has_waiters(lock)) { + unsigned long flags; + struct rt_mutex_waiter *top; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* remove the queued waiter. */ + if (waiter) { + plist_del(&waiter->list_entry, &lock->wait_list); + task->pi_blocked_on = NULL; + } + + /* + * We have to enqueue the top waiter(if it exists) into + * task->pi_waiters list. + */ + if (rt_mutex_has_waiters(lock)) { + top = rt_mutex_top_waiter(lock); + top->pi_list_entry.prio = top->list_entry.prio; + plist_add(&top->pi_list_entry, &task->pi_waiters); + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + } + + /* We got the lock. */ + debug_rt_mutex_lock(lock); + + rt_mutex_set_owner(lock, task); + + rt_mutex_deadlock_account_lock(lock, task); + + return 1; +} + +/* + * Task blocks on lock. + * + * Prepare waiter and propagate pi chain + * + * This must be called with lock->wait_lock held. + */ +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + int detect_deadlock) +{ + struct task_struct *owner = rt_mutex_owner(lock); + struct rt_mutex_waiter *top_waiter = waiter; + unsigned long flags; + int chain_walk = 0, res; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + waiter->task = task; + waiter->lock = lock; + plist_node_init(&waiter->list_entry, task->prio); + plist_node_init(&waiter->pi_list_entry, task->prio); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) + top_waiter = rt_mutex_top_waiter(lock); + plist_add(&waiter->list_entry, &lock->wait_list); + + task->pi_blocked_on = waiter; + + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + if (!owner) + return 0; + + if (waiter == rt_mutex_top_waiter(lock)) { + raw_spin_lock_irqsave(&owner->pi_lock, flags); + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) + chain_walk = 1; + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + } + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) + chain_walk = 1; + + if (!chain_walk) + return 0; + + /* + * The owner can't disappear while holding a lock, + * so the owner struct is protected by wait_lock. + * Gets dropped in rt_mutex_adjust_prio_chain()! + */ + get_task_struct(owner); + + raw_spin_unlock(&lock->wait_lock); + + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, + task); + + raw_spin_lock(&lock->wait_lock); + + return res; +} + +/* + * Wake up the next waiter on the lock. + * + * Remove the top waiter from the current tasks waiter list and wake it up. + * + * Called with lock->wait_lock held. + */ +static void wakeup_next_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *waiter; + unsigned long flags; + + raw_spin_lock_irqsave(¤t->pi_lock, flags); + + waiter = rt_mutex_top_waiter(lock); + + /* + * Remove it from current->pi_waiters. We do not adjust a + * possible priority boost right now. We execute wakeup in the + * boosted mode and go back to normal after releasing + * lock->wait_lock. + */ + plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + + rt_mutex_set_owner(lock, NULL); + + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + + wake_up_process(waiter->task); +} + +/* + * Remove a waiter from a lock and give up + * + * Must be called with lock->wait_lock held and + * have just failed to try_to_take_rt_mutex(). + */ +static void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter) +{ + int first = (waiter == rt_mutex_top_waiter(lock)); + struct task_struct *owner = rt_mutex_owner(lock); + unsigned long flags; + int chain_walk = 0; + + raw_spin_lock_irqsave(¤t->pi_lock, flags); + plist_del(&waiter->list_entry, &lock->wait_list); + current->pi_blocked_on = NULL; + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (!owner) + return; + + if (first) { + + raw_spin_lock_irqsave(&owner->pi_lock, flags); + + plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &owner->pi_waiters); + } + __rt_mutex_adjust_prio(owner); + + if (owner->pi_blocked_on) + chain_walk = 1; + + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + } + + WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + + if (!chain_walk) + return; + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(owner); + + raw_spin_unlock(&lock->wait_lock); + + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); + + raw_spin_lock(&lock->wait_lock); +} + +/* + * Recheck the pi chain, in case we got a priority setting + * + * Called from sched_setscheduler + */ +void rt_mutex_adjust_pi(struct task_struct *task) +{ + struct rt_mutex_waiter *waiter; + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + if (!waiter || waiter->list_entry.prio == task->prio) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return; + } + + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); + rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); +} + +/** + * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop + * @lock: the rt_mutex to take + * @state: the state the task should block in (TASK_INTERRUPTIBLE + * or TASK_UNINTERRUPTIBLE) + * @timeout: the pre-initialized and started timer, or NULL for none + * @waiter: the pre-initialized rt_mutex_waiter + * + * lock->wait_lock must be held by the caller. + */ +static int __sched +__rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter) +{ + int ret = 0; + + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock, current, waiter)) + break; + + /* + * TASK_INTERRUPTIBLE checks for signals and + * timeout. Ignored otherwise. + */ + if (unlikely(state == TASK_INTERRUPTIBLE)) { + /* Signal pending? */ + if (signal_pending(current)) + ret = -EINTR; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + if (ret) + break; + } + + raw_spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(waiter); + + schedule_rt_mutex(lock); + + raw_spin_lock(&lock->wait_lock); + set_current_state(state); + } + + return ret; +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + struct rt_mutex_waiter waiter; + int ret = 0; + + debug_rt_mutex_init_waiter(&waiter); + + raw_spin_lock(&lock->wait_lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock, current, NULL)) { + raw_spin_unlock(&lock->wait_lock); + return 0; + } + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + + if (likely(!ret)) + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); + + set_current_state(TASK_RUNNING); + + if (unlikely(ret)) + remove_waiter(lock, &waiter); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + raw_spin_unlock(&lock->wait_lock); + + /* Remove pending timer: */ + if (unlikely(timeout)) + hrtimer_cancel(&timeout->timer); + + debug_rt_mutex_free_waiter(&waiter); + + return ret; +} + +/* + * Slow path try-lock function: + */ +static inline int +rt_mutex_slowtrylock(struct rt_mutex *lock) +{ + int ret = 0; + + raw_spin_lock(&lock->wait_lock); + + if (likely(rt_mutex_owner(lock) != current)) { + + ret = try_to_take_rt_mutex(lock, current, NULL); + /* + * try_to_take_rt_mutex() sets the lock waiters + * bit unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + } + + raw_spin_unlock(&lock->wait_lock); + + return ret; +} + +/* + * Slow path to release a rt-mutex: + */ +static void __sched +rt_mutex_slowunlock(struct rt_mutex *lock) +{ + raw_spin_lock(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + rt_mutex_deadlock_account_unlock(current); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + raw_spin_unlock(&lock->wait_lock); + return; + } + + wakeup_next_waiter(lock); + + raw_spin_unlock(&lock->wait_lock); + + /* Undo pi boosting if necessary: */ + rt_mutex_adjust_prio(current); +} + +/* + * debug aware fast / slowpath lock,trylock,unlock + * + * The atomic acquire/release ops are compiled away, when either the + * architecture does not support cmpxchg or when debugging is enabled. + */ +static inline int +rt_mutex_fastlock(struct rt_mutex *lock, int state, + int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, NULL, detect_deadlock); +} + +static inline int +rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, int detect_deadlock, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock)) +{ + if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 0; + } else + return slowfn(lock, state, timeout, detect_deadlock); +} + +static inline int +rt_mutex_fasttrylock(struct rt_mutex *lock, + int (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { + rt_mutex_deadlock_account_lock(lock, current); + return 1; + } + return slowfn(lock); +} + +static inline void +rt_mutex_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) + rt_mutex_deadlock_account_unlock(current); + else + slowfn(lock); +} + +/** + * rt_mutex_lock - lock a rt_mutex + * + * @lock: the rt_mutex to be locked + */ +void __sched rt_mutex_lock(struct rt_mutex *lock) +{ + might_sleep(); + + rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock); + +/** + * rt_mutex_lock_interruptible - lock a rt_mutex interruptible + * + * @lock: the rt_mutex to be locked + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +/** + * rt_mutex_timed_lock - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller + * + * @lock: the rt_mutex to be locked + * @timeout: timeout structure or NULL (no timeout) + * @detect_deadlock: deadlock detection on/off + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + * -ETIMEDOUT when the timeout expired + * -EDEADLK when the lock would deadlock (when deadlock detection is on) + */ +int +rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + might_sleep(); + + return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, + detect_deadlock, rt_mutex_slowlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); + +/** + * rt_mutex_trylock - try to lock a rt_mutex + * + * @lock: the rt_mutex to be locked + * + * Returns 1 on success and 0 on contention + */ +int __sched rt_mutex_trylock(struct rt_mutex *lock) +{ + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); +} +EXPORT_SYMBOL_GPL(rt_mutex_trylock); + +/** + * rt_mutex_unlock - unlock a rt_mutex + * + * @lock: the rt_mutex to be unlocked + */ +void __sched rt_mutex_unlock(struct rt_mutex *lock) +{ + rt_mutex_fastunlock(lock, rt_mutex_slowunlock); +} +EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +/** + * rt_mutex_destroy - mark a mutex unusable + * @lock: the mutex to be destroyed + * + * This function marks the mutex uninitialized, and any subsequent + * use of the mutex is forbidden. The mutex must not be locked when + * this function is called. + */ +void rt_mutex_destroy(struct rt_mutex *lock) +{ + WARN_ON(rt_mutex_is_locked(lock)); +#ifdef CONFIG_DEBUG_RT_MUTEXES + lock->magic = NULL; +#endif +} + +EXPORT_SYMBOL_GPL(rt_mutex_destroy); + +/** + * __rt_mutex_init - initialize the rt lock + * + * @lock: the rt lock to be initialized + * + * Initialize the rt lock to unlocked state. + * + * Initializing of a locked rt lock is not allowed + */ +void __rt_mutex_init(struct rt_mutex *lock, const char *name) +{ + lock->owner = NULL; + raw_spin_lock_init(&lock->wait_lock); + plist_head_init(&lock->wait_list); + + debug_rt_mutex_init(lock, name); +} +EXPORT_SYMBOL_GPL(__rt_mutex_init); + +/** + * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a + * proxy owner + * + * @lock: the rt_mutex to be locked + * @proxy_owner:the task to set as owner + * + * No locking. Caller has to do serializing itself + * Special API call for PI-futex support + */ +void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner) +{ + __rt_mutex_init(lock, NULL); + debug_rt_mutex_proxy_lock(lock, proxy_owner); + rt_mutex_set_owner(lock, proxy_owner); + rt_mutex_deadlock_account_lock(lock, proxy_owner); +} + +/** + * rt_mutex_proxy_unlock - release a lock on behalf of owner + * + * @lock: the rt_mutex to be locked + * + * No locking. Caller has to do serializing itself + * Special API call for PI-futex support + */ +void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner) +{ + debug_rt_mutex_proxy_unlock(lock); + rt_mutex_set_owner(lock, NULL); + rt_mutex_deadlock_account_unlock(proxy_owner); +} + +/** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) +{ + int ret; + + raw_spin_lock(&lock->wait_lock); + + if (try_to_take_rt_mutex(lock, task, NULL)) { + raw_spin_unlock(&lock->wait_lock); + return 1; + } + + ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + + if (ret && !rt_mutex_owner(lock)) { + /* + * Reset the return value. We might have + * returned with -EDEADLK and the owner + * released the lock while we were walking the + * pi chain. Let the waiter sort it out. + */ + ret = 0; + } + + if (unlikely(ret)) + remove_waiter(lock, waiter); + + raw_spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(waiter); + + return ret; +} + +/** + * rt_mutex_next_owner - return the next owner of the lock + * + * @lock: the rt lock query + * + * Returns the next owner of the lock or NULL + * + * Caller has to serialize against other accessors to the lock + * itself. + * + * Special API call for PI-futex support + */ +struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + return NULL; + + return rt_mutex_top_waiter(lock)->task; +} + +/** + * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * @lock: the rt_mutex we were woken on + * @to: the timeout, null if none. hrtimer should already have + * been started. + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Complete the lock acquisition started our behalf by another thread. + * + * Returns: + * 0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * + * Special API call for PI-futex requeue support + */ +int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock) +{ + int ret; + + raw_spin_lock(&lock->wait_lock); + + set_current_state(TASK_INTERRUPTIBLE); + + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); + + set_current_state(TASK_RUNNING); + + if (unlikely(ret)) + remove_waiter(lock, waiter); + + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + raw_spin_unlock(&lock->wait_lock); + + return ret; +} diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h new file mode 100644 index 000000000000..a1a1dd06421d --- /dev/null +++ b/kernel/locking/rtmutex.h @@ -0,0 +1,26 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains macros used solely by rtmutex.c. + * Non-debug version. + */ + +#define rt_mutex_deadlock_check(l) (0) +#define rt_mutex_deadlock_account_lock(m, t) do { } while (0) +#define rt_mutex_deadlock_account_unlock(l) do { } while (0) +#define debug_rt_mutex_init_waiter(w) do { } while (0) +#define debug_rt_mutex_free_waiter(w) do { } while (0) +#define debug_rt_mutex_lock(l) do { } while (0) +#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) +#define debug_rt_mutex_proxy_unlock(l) do { } while (0) +#define debug_rt_mutex_unlock(l) do { } while (0) +#define debug_rt_mutex_init(m, n) do { } while (0) +#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) +#define debug_rt_mutex_print_deadlock(w) do { } while (0) +#define debug_rt_mutex_detect_deadlock(w,d) (d) +#define debug_rt_mutex_reset_waiter(w) do { } while (0) diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h new file mode 100644 index 000000000000..53a66c85261b --- /dev/null +++ b/kernel/locking/rtmutex_common.h @@ -0,0 +1,126 @@ +/* + * RT Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains the private data structure and API definitions. + */ + +#ifndef __KERNEL_RTMUTEX_COMMON_H +#define __KERNEL_RTMUTEX_COMMON_H + +#include + +/* + * The rtmutex in kernel tester is independent of rtmutex debugging. We + * call schedule_rt_mutex_test() instead of schedule() for the tasks which + * belong to the tester. That way we can delay the wakeup path of those + * threads to provoke lock stealing and testing of complex boosting scenarios. + */ +#ifdef CONFIG_RT_MUTEX_TESTER + +extern void schedule_rt_mutex_test(struct rt_mutex *lock); + +#define schedule_rt_mutex(_lock) \ + do { \ + if (!(current->flags & PF_MUTEX_TESTER)) \ + schedule(); \ + else \ + schedule_rt_mutex_test(_lock); \ + } while (0) + +#else +# define schedule_rt_mutex(_lock) schedule() +#endif + +/* + * This is the control structure for tasks blocked on a rt_mutex, + * which is allocated on the kernel stack on of the blocked task. + * + * @list_entry: pi node to enqueue into the mutex waiters list + * @pi_list_entry: pi node to enqueue into the mutex owner waiters list + * @task: task reference to the blocked task + */ +struct rt_mutex_waiter { + struct plist_node list_entry; + struct plist_node pi_list_entry; + struct task_struct *task; + struct rt_mutex *lock; +#ifdef CONFIG_DEBUG_RT_MUTEXES + unsigned long ip; + struct pid *deadlock_task_pid; + struct rt_mutex *deadlock_lock; +#endif +}; + +/* + * Various helpers to access the waiters-plist: + */ +static inline int rt_mutex_has_waiters(struct rt_mutex *lock) +{ + return !plist_head_empty(&lock->wait_list); +} + +static inline struct rt_mutex_waiter * +rt_mutex_top_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *w; + + w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, + list_entry); + BUG_ON(w->lock != lock); + + return w; +} + +static inline int task_has_pi_waiters(struct task_struct *p) +{ + return !plist_head_empty(&p->pi_waiters); +} + +static inline struct rt_mutex_waiter * +task_top_pi_waiter(struct task_struct *p) +{ + return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, + pi_list_entry); +} + +/* + * lock->owner state tracking: + */ +#define RT_MUTEX_HAS_WAITERS 1UL +#define RT_MUTEX_OWNER_MASKALL 1UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) +{ + return (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); +} + +/* + * PI-futex support (proxy locking functions, etc.): + */ +extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); +extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner); +extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + int detect_deadlock); +extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock); + +#ifdef CONFIG_DEBUG_RT_MUTEXES +# include "rtmutex-debug.h" +#else +# include "rtmutex.h" +#endif + +#endif diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3822ac0c4b27..6abb03dff5c0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1133,7 +1133,7 @@ void exit_rcu(void) #ifdef CONFIG_RCU_BOOST -#include "../rtmutex_common.h" +#include "../locking/rtmutex_common.h" #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c deleted file mode 100644 index 13b243a323fa..000000000000 --- a/kernel/rtmutex-debug.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2006 Timesys Corp., Thomas Gleixner - * - * This code is based on the rt.c implementation in the preempt-rt tree. - * Portions of said code are - * - * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey - * Copyright (C) 2006 Esben Nielsen - * Copyright (C) 2006 Kihon Technologies Inc., - * Steven Rostedt - * - * See rt.c in preempt-rt for proper credits and further information - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rtmutex_common.h" - -static void printk_task(struct task_struct *p) -{ - if (p) - printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); - else - printk(""); -} - -static void printk_lock(struct rt_mutex *lock, int print_owner) -{ - if (lock->name) - printk(" [%p] {%s}\n", - lock, lock->name); - else - printk(" [%p] {%s:%d}\n", - lock, lock->file, lock->line); - - if (print_owner && rt_mutex_owner(lock)) { - printk(".. ->owner: %p\n", lock->owner); - printk(".. held by: "); - printk_task(rt_mutex_owner(lock)); - printk("\n"); - } -} - -void rt_mutex_debug_task_free(struct task_struct *task) -{ - DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); - DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); -} - -/* - * We fill out the fields in the waiter to store the information about - * the deadlock. We print when we return. act_waiter can be NULL in - * case of a remove waiter operation. - */ -void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, - struct rt_mutex *lock) -{ - struct task_struct *task; - - if (!debug_locks || detect || !act_waiter) - return; - - task = rt_mutex_owner(act_waiter->lock); - if (task && task != current) { - act_waiter->deadlock_task_pid = get_pid(task_pid(task)); - act_waiter->deadlock_lock = lock; - } -} - -void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) -{ - struct task_struct *task; - - if (!waiter->deadlock_lock || !debug_locks) - return; - - rcu_read_lock(); - task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); - if (!task) { - rcu_read_unlock(); - return; - } - - if (!debug_locks_off()) { - rcu_read_unlock(); - return; - } - - printk("\n============================================\n"); - printk( "[ BUG: circular locking deadlock detected! ]\n"); - printk("%s\n", print_tainted()); - printk( "--------------------------------------------\n"); - printk("%s/%d is deadlocking current task %s/%d\n\n", - task->comm, task_pid_nr(task), - current->comm, task_pid_nr(current)); - - printk("\n1) %s/%d is trying to acquire this lock:\n", - current->comm, task_pid_nr(current)); - printk_lock(waiter->lock, 1); - - printk("\n2) %s/%d is blocked on this lock:\n", - task->comm, task_pid_nr(task)); - printk_lock(waiter->deadlock_lock, 1); - - debug_show_held_locks(current); - debug_show_held_locks(task); - - printk("\n%s/%d's [blocked] stackdump:\n\n", - task->comm, task_pid_nr(task)); - show_stack(task, NULL); - printk("\n%s/%d's [current] stackdump:\n\n", - current->comm, task_pid_nr(current)); - dump_stack(); - debug_show_all_locks(); - rcu_read_unlock(); - - printk("[ turning off deadlock detection." - "Please report this trace. ]\n\n"); -} - -void debug_rt_mutex_lock(struct rt_mutex *lock) -{ -} - -void debug_rt_mutex_unlock(struct rt_mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); -} - -void -debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) -{ -} - -void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) -{ - DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); -} - -void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) -{ - memset(waiter, 0x11, sizeof(*waiter)); - plist_node_init(&waiter->list_entry, MAX_PRIO); - plist_node_init(&waiter->pi_list_entry, MAX_PRIO); - waiter->deadlock_task_pid = NULL; -} - -void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) -{ - put_pid(waiter->deadlock_task_pid); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - memset(waiter, 0x22, sizeof(*waiter)); -} - -void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) -{ - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lock->name = name; -} - -void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) -{ -} - -void rt_mutex_deadlock_account_unlock(struct task_struct *task) -{ -} - diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h deleted file mode 100644 index 14193d596d78..000000000000 --- a/kernel/rtmutex-debug.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner - * - * This file contains macros used solely by rtmutex.c. Debug version. - */ - -extern void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); -extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); -extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); -extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); -extern void debug_rt_mutex_lock(struct rt_mutex *lock); -extern void debug_rt_mutex_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, - struct task_struct *powner); -extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, - struct rt_mutex *lock); -extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_reset_waiter(w) \ - do { (w)->deadlock_lock = NULL; } while (0) - -static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, - int detect) -{ - return (waiter != NULL); -} diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c deleted file mode 100644 index 1d96dd0d93c1..000000000000 --- a/kernel/rtmutex-tester.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - * RT-Mutex-tester: scriptable tester for rt mutexes - * - * started by Thomas Gleixner: - * - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rtmutex.h" - -#define MAX_RT_TEST_THREADS 8 -#define MAX_RT_TEST_MUTEXES 8 - -static spinlock_t rttest_lock; -static atomic_t rttest_event; - -struct test_thread_data { - int opcode; - int opdata; - int mutexes[MAX_RT_TEST_MUTEXES]; - int event; - struct device dev; -}; - -static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static struct task_struct *threads[MAX_RT_TEST_THREADS]; -static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; - -enum test_opcodes { - RTTEST_NOP = 0, - RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ - RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ - RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ - RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ - RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ - RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - /* 9, 10 - reserved for BKL commemoration */ - RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ - RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ - RTTEST_RESET = 99, /* 99 Reset all pending operations */ -}; - -static int handle_op(struct test_thread_data *td, int lockwakeup) -{ - int i, id, ret = -EINVAL; - - switch(td->opcode) { - - case RTTEST_NOP: - return 0; - - case RTTEST_LOCKCONT: - td->mutexes[td->opdata] = 1; - td->event = atomic_add_return(1, &rttest_event); - return 0; - - case RTTEST_RESET: - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { - if (td->mutexes[i] == 4) { - rt_mutex_unlock(&mutexes[i]); - td->mutexes[i] = 0; - } - } - return 0; - - case RTTEST_RESETEVENT: - atomic_set(&rttest_event, 0); - return 0; - - default: - if (lockwakeup) - return ret; - } - - switch(td->opcode) { - - case RTTEST_LOCK: - case RTTEST_LOCKNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_lock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 4; - return 0; - - case RTTEST_LOCKINT: - case RTTEST_LOCKINTNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - ret = rt_mutex_lock_interruptible(&mutexes[id], 0); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = ret ? 0 : 4; - return ret ? -EINTR : 0; - - case RTTEST_UNLOCK: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) - return ret; - - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_unlock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 0; - return 0; - - default: - break; - } - return ret; -} - -/* - * Schedule replacement for rtsem_down(). Only called for threads with - * PF_MUTEX_TESTER set. - * - * This allows us to have finegrained control over the event flow. - * - */ -void schedule_rt_mutex_test(struct rt_mutex *mutex) -{ - int tid, op, dat; - struct test_thread_data *td; - - /* We have to lookup the task */ - for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { - if (threads[tid] == current) - break; - } - - BUG_ON(tid == MAX_RT_TEST_THREADS); - - td = &thread_data[tid]; - - op = td->opcode; - dat = td->opdata; - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - break; - - if (td->mutexes[dat] != 1) - break; - - td->mutexes[dat] = 2; - td->event = atomic_add_return(1, &rttest_event); - break; - - default: - break; - } - - schedule(); - - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 3; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 1; - td->event = atomic_add_return(1, &rttest_event); - return; - - default: - return; - } - - td->opcode = 0; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - int ret; - - set_current_state(TASK_RUNNING); - ret = handle_op(td, 1); - set_current_state(TASK_INTERRUPTIBLE); - if (td->opcode == RTTEST_LOCKCONT) - break; - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - } - - /* Restore previous command and data */ - td->opcode = op; - td->opdata = dat; -} - -static int test_func(void *data) -{ - struct test_thread_data *td = data; - int ret; - - current->flags |= PF_MUTEX_TESTER; - set_freezable(); - allow_signal(SIGHUP); - - for(;;) { - - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - set_current_state(TASK_RUNNING); - ret = handle_op(td, 0); - set_current_state(TASK_INTERRUPTIBLE); - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - try_to_freeze(); - - if (signal_pending(current)) - flush_signals(current); - - if(kthread_should_stop()) - break; - } - return 0; -} - -/** - * sysfs_test_command - interface for test commands - * @dev: thread reference - * @buf: command for actual step - * @count: length of buffer - * - * command syntax: - * - * opcode:data - */ -static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct sched_param schedpar; - struct test_thread_data *td; - char cmdbuf[32]; - int op, dat, tid, ret; - - td = container_of(dev, struct test_thread_data, dev); - tid = td->dev.id; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(cmdbuf)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - if (count < 1) - return -EINVAL; - - memcpy(cmdbuf, buf, count); - cmdbuf[count] = 0; - - if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) - return -EINVAL; - - switch (op) { - case RTTEST_SCHEDOT: - schedpar.sched_priority = 0; - ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); - if (ret) - return ret; - set_user_nice(current, 0); - break; - - case RTTEST_SCHEDRT: - schedpar.sched_priority = dat; - ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); - if (ret) - return ret; - break; - - case RTTEST_SIGNAL: - send_sig(SIGHUP, threads[tid], 0); - break; - - default: - if (td->opcode > 0) - return -EBUSY; - td->opdata = dat; - td->opcode = op; - wake_up_process(threads[tid]); - } - - return count; -} - -/** - * sysfs_test_status - sysfs interface for rt tester - * @dev: thread to query - * @buf: char buffer to be filled with thread status info - */ -static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct test_thread_data *td; - struct task_struct *tsk; - char *curr = buf; - int i; - - td = container_of(dev, struct test_thread_data, dev); - tsk = threads[td->dev.id]; - - spin_lock(&rttest_lock); - - curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", - td->opcode, td->event, tsk->state, - (MAX_RT_PRIO - 1) - tsk->prio, - (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on); - - for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) - curr += sprintf(curr, "%d", td->mutexes[i]); - - spin_unlock(&rttest_lock); - - curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->dev.id].owner); - - return curr - buf; -} - -static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); -static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); - -static struct bus_type rttest_subsys = { - .name = "rttest", - .dev_name = "rttest", -}; - -static int init_test_thread(int id) -{ - thread_data[id].dev.bus = &rttest_subsys; - thread_data[id].dev.id = id; - - threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); - if (IS_ERR(threads[id])) - return PTR_ERR(threads[id]); - - return device_register(&thread_data[id].dev); -} - -static int init_rttest(void) -{ - int ret, i; - - spin_lock_init(&rttest_lock); - - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) - rt_mutex_init(&mutexes[i]); - - ret = subsys_system_register(&rttest_subsys, NULL); - if (ret) - return ret; - - for (i = 0; i < MAX_RT_TEST_THREADS; i++) { - ret = init_test_thread(i); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_status); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_command); - if (ret) - break; - } - - printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); - - return ret; -} - -device_initcall(init_rttest); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c deleted file mode 100644 index 0dd6aec1cb6a..000000000000 --- a/kernel/rtmutex.c +++ /dev/null @@ -1,1060 +0,0 @@ -/* - * RT-Mutexes: simple blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner. - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner - * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt - * Copyright (C) 2006 Esben Nielsen - * - * See Documentation/rt-mutex-design.txt for details. - */ -#include -#include -#include -#include -#include - -#include "rtmutex_common.h" - -/* - * lock->owner state tracking: - * - * lock->owner holds the task_struct pointer of the owner. Bit 0 - * is used to keep track of the "lock has waiters" state. - * - * owner bit0 - * NULL 0 lock is free (fast acquire possible) - * NULL 1 lock is free and has waiters and the top waiter - * is going to take the lock* - * taskpointer 0 lock is held (fast release possible) - * taskpointer 1 lock is held and has waiters** - * - * The fast atomic compare exchange based acquire and release is only - * possible when bit 0 of lock->owner is 0. - * - * (*) It also can be a transitional state when grabbing the lock - * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, - * we need to set the bit0 before looking at the lock, and the owner may be - * NULL in this small time, hence this can be a transitional state. - * - * (**) There is a small time when bit 0 is set but there are no - * waiters. This can happen when grabbing the lock in the slow path. - * To prevent a cmpxchg of the owner releasing the lock, we need to - * set this bit before looking at the lock. - */ - -static void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) -{ - unsigned long val = (unsigned long)owner; - - if (rt_mutex_has_waiters(lock)) - val |= RT_MUTEX_HAS_WAITERS; - - lock->owner = (struct task_struct *)val; -} - -static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) -{ - lock->owner = (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); -} - -static void fixup_rt_mutex_waiters(struct rt_mutex *lock) -{ - if (!rt_mutex_has_waiters(lock)) - clear_rt_mutex_waiters(lock); -} - -/* - * We can speed up the acquire/release, if the architecture - * supports cmpxchg and if there's no debugging state to be set up - */ -#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) -# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - unsigned long owner, *p = (unsigned long *) &lock->owner; - - do { - owner = *p; - } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); -} -#else -# define rt_mutex_cmpxchg(l,c,n) (0) -static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) -{ - lock->owner = (struct task_struct *) - ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); -} -#endif - -/* - * Calculate task priority from the waiter list priority - * - * Return task->normal_prio when the waiter list is empty or when - * the waiter is not allowed to do priority boosting - */ -int rt_mutex_getprio(struct task_struct *task) -{ - if (likely(!task_has_pi_waiters(task))) - return task->normal_prio; - - return min(task_top_pi_waiter(task)->pi_list_entry.prio, - task->normal_prio); -} - -/* - * Adjust the priority of a task, after its pi_waiters got modified. - * - * This can be both boosting and unboosting. task->pi_lock must be held. - */ -static void __rt_mutex_adjust_prio(struct task_struct *task) -{ - int prio = rt_mutex_getprio(task); - - if (task->prio != prio) - rt_mutex_setprio(task, prio); -} - -/* - * Adjust task priority (undo boosting). Called from the exit path of - * rt_mutex_slowunlock() and rt_mutex_slowlock(). - * - * (Note: We do this outside of the protection of lock->wait_lock to - * allow the lock to be taken while or before we readjust the priority - * of task. We do not use the spin_xx_mutex() variants here as we are - * outside of the debug path.) - */ -static void rt_mutex_adjust_prio(struct task_struct *task) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); -} - -/* - * Max number of times we'll walk the boosting chain: - */ -int max_lock_depth = 1024; - -/* - * Adjust the priority chain. Also used for deadlock detection. - * Decreases task's usage by one - may thus free the task. - * - * @task: the task owning the mutex (owner) for which a chain walk is probably - * needed - * @deadlock_detect: do we have to carry out deadlock detection? - * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck - * things for a task that has just got its priority adjusted, and - * is waiting on a mutex) - * @orig_waiter: rt_mutex_waiter struct for the task that has just donated - * its priority to the mutex owner (can be NULL in the case - * depicted above or if the top waiter is gone away and we are - * actually deboosting the owner) - * @top_task: the current top waiter - * - * Returns 0 or -EDEADLK. - */ -static int rt_mutex_adjust_prio_chain(struct task_struct *task, - int deadlock_detect, - struct rt_mutex *orig_lock, - struct rt_mutex_waiter *orig_waiter, - struct task_struct *top_task) -{ - struct rt_mutex *lock; - struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; - int detect_deadlock, ret = 0, depth = 0; - unsigned long flags; - - detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, - deadlock_detect); - - /* - * The (de)boosting is a step by step approach with a lot of - * pitfalls. We want this to be preemptible and we want hold a - * maximum of two locks per step. So we have to check - * carefully whether things change under us. - */ - again: - if (++depth > max_lock_depth) { - static int prev_max; - - /* - * Print this only once. If the admin changes the limit, - * print a new message when reaching the limit again. - */ - if (prev_max != max_lock_depth) { - prev_max = max_lock_depth; - printk(KERN_WARNING "Maximum lock depth %d reached " - "task: %s (%d)\n", max_lock_depth, - top_task->comm, task_pid_nr(top_task)); - } - put_task_struct(task); - - return deadlock_detect ? -EDEADLK : 0; - } - retry: - /* - * Task can not go away as we did a get_task() before ! - */ - raw_spin_lock_irqsave(&task->pi_lock, flags); - - waiter = task->pi_blocked_on; - /* - * Check whether the end of the boosting chain has been - * reached or the state of the chain has changed while we - * dropped the locks. - */ - if (!waiter) - goto out_unlock_pi; - - /* - * Check the orig_waiter state. After we dropped the locks, - * the previous owner of the lock might have released the lock. - */ - if (orig_waiter && !rt_mutex_owner(orig_lock)) - goto out_unlock_pi; - - /* - * Drop out, when the task has no waiters. Note, - * top_waiter can be NULL, when we are in the deboosting - * mode! - */ - if (top_waiter && (!task_has_pi_waiters(task) || - top_waiter != task_top_pi_waiter(task))) - goto out_unlock_pi; - - /* - * When deadlock detection is off then we check, if further - * priority adjustment is necessary. - */ - if (!detect_deadlock && waiter->list_entry.prio == task->prio) - goto out_unlock_pi; - - lock = waiter->lock; - if (!raw_spin_trylock(&lock->wait_lock)) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - cpu_relax(); - goto retry; - } - - /* Deadlock detection */ - if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); - raw_spin_unlock(&lock->wait_lock); - ret = deadlock_detect ? -EDEADLK : 0; - goto out_unlock_pi; - } - - top_waiter = rt_mutex_top_waiter(lock); - - /* Requeue the waiter */ - plist_del(&waiter->list_entry, &lock->wait_list); - waiter->list_entry.prio = task->prio; - plist_add(&waiter->list_entry, &lock->wait_list); - - /* Release the task */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - if (!rt_mutex_owner(lock)) { - /* - * If the requeue above changed the top waiter, then we need - * to wake the new top waiter up to try to get the lock. - */ - - if (top_waiter != rt_mutex_top_waiter(lock)) - wake_up_process(rt_mutex_top_waiter(lock)->task); - raw_spin_unlock(&lock->wait_lock); - goto out_put_task; - } - put_task_struct(task); - - /* Grab the next task */ - task = rt_mutex_owner(lock); - get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); - - if (waiter == rt_mutex_top_waiter(lock)) { - /* Boost the owner */ - plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - - } else if (top_waiter == waiter) { - /* Deboost the owner */ - plist_del(&waiter->pi_list_entry, &task->pi_waiters); - waiter = rt_mutex_top_waiter(lock); - waiter->pi_list_entry.prio = waiter->list_entry.prio; - plist_add(&waiter->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - } - - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - top_waiter = rt_mutex_top_waiter(lock); - raw_spin_unlock(&lock->wait_lock); - - if (!detect_deadlock && waiter != top_waiter) - goto out_put_task; - - goto again; - - out_unlock_pi: - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - out_put_task: - put_task_struct(task); - - return ret; -} - -/* - * Try to take an rt-mutex - * - * Must be called with lock->wait_lock held. - * - * @lock: the lock to be acquired. - * @task: the task which wants to acquire the lock - * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) - */ -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) -{ - /* - * We have to be careful here if the atomic speedups are - * enabled, such that, when - * - no other waiter is on the lock - * - the lock has been released since we did the cmpxchg - * the lock can be released or taken while we are doing the - * checks and marking the lock with RT_MUTEX_HAS_WAITERS. - * - * The atomic acquire/release aware variant of - * mark_rt_mutex_waiters uses a cmpxchg loop. After setting - * the WAITERS bit, the atomic release / acquire can not - * happen anymore and lock->wait_lock protects us from the - * non-atomic case. - * - * Note, that this might set lock->owner = - * RT_MUTEX_HAS_WAITERS in the case the lock is not contended - * any more. This is fixed up when we take the ownership. - * This is the transitional state explained at the top of this file. - */ - mark_rt_mutex_waiters(lock); - - if (rt_mutex_owner(lock)) - return 0; - - /* - * It will get the lock because of one of these conditions: - * 1) there is no waiter - * 2) higher priority than waiters - * 3) it is top waiter - */ - if (rt_mutex_has_waiters(lock)) { - if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { - if (!waiter || waiter != rt_mutex_top_waiter(lock)) - return 0; - } - } - - if (waiter || rt_mutex_has_waiters(lock)) { - unsigned long flags; - struct rt_mutex_waiter *top; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - - /* remove the queued waiter. */ - if (waiter) { - plist_del(&waiter->list_entry, &lock->wait_list); - task->pi_blocked_on = NULL; - } - - /* - * We have to enqueue the top waiter(if it exists) into - * task->pi_waiters list. - */ - if (rt_mutex_has_waiters(lock)) { - top = rt_mutex_top_waiter(lock); - top->pi_list_entry.prio = top->list_entry.prio; - plist_add(&top->pi_list_entry, &task->pi_waiters); - } - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - } - - /* We got the lock. */ - debug_rt_mutex_lock(lock); - - rt_mutex_set_owner(lock, task); - - rt_mutex_deadlock_account_lock(lock, task); - - return 1; -} - -/* - * Task blocks on lock. - * - * Prepare waiter and propagate pi chain - * - * This must be called with lock->wait_lock held. - */ -static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task, - int detect_deadlock) -{ - struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex_waiter *top_waiter = waiter; - unsigned long flags; - int chain_walk = 0, res; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - __rt_mutex_adjust_prio(task); - waiter->task = task; - waiter->lock = lock; - plist_node_init(&waiter->list_entry, task->prio); - plist_node_init(&waiter->pi_list_entry, task->prio); - - /* Get the top priority waiter on the lock */ - if (rt_mutex_has_waiters(lock)) - top_waiter = rt_mutex_top_waiter(lock); - plist_add(&waiter->list_entry, &lock->wait_list); - - task->pi_blocked_on = waiter; - - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - if (!owner) - return 0; - - if (waiter == rt_mutex_top_waiter(lock)) { - raw_spin_lock_irqsave(&owner->pi_lock, flags); - plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); - plist_add(&waiter->pi_list_entry, &owner->pi_waiters); - - __rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) - chain_walk = 1; - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } - else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) - chain_walk = 1; - - if (!chain_walk) - return 0; - - /* - * The owner can't disappear while holding a lock, - * so the owner struct is protected by wait_lock. - * Gets dropped in rt_mutex_adjust_prio_chain()! - */ - get_task_struct(owner); - - raw_spin_unlock(&lock->wait_lock); - - res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - task); - - raw_spin_lock(&lock->wait_lock); - - return res; -} - -/* - * Wake up the next waiter on the lock. - * - * Remove the top waiter from the current tasks waiter list and wake it up. - * - * Called with lock->wait_lock held. - */ -static void wakeup_next_waiter(struct rt_mutex *lock) -{ - struct rt_mutex_waiter *waiter; - unsigned long flags; - - raw_spin_lock_irqsave(¤t->pi_lock, flags); - - waiter = rt_mutex_top_waiter(lock); - - /* - * Remove it from current->pi_waiters. We do not adjust a - * possible priority boost right now. We execute wakeup in the - * boosted mode and go back to normal after releasing - * lock->wait_lock. - */ - plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); - - rt_mutex_set_owner(lock, NULL); - - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - - wake_up_process(waiter->task); -} - -/* - * Remove a waiter from a lock and give up - * - * Must be called with lock->wait_lock held and - * have just failed to try_to_take_rt_mutex(). - */ -static void remove_waiter(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter) -{ - int first = (waiter == rt_mutex_top_waiter(lock)); - struct task_struct *owner = rt_mutex_owner(lock); - unsigned long flags; - int chain_walk = 0; - - raw_spin_lock_irqsave(¤t->pi_lock, flags); - plist_del(&waiter->list_entry, &lock->wait_list); - current->pi_blocked_on = NULL; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - - if (!owner) - return; - - if (first) { - - raw_spin_lock_irqsave(&owner->pi_lock, flags); - - plist_del(&waiter->pi_list_entry, &owner->pi_waiters); - - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &owner->pi_waiters); - } - __rt_mutex_adjust_prio(owner); - - if (owner->pi_blocked_on) - chain_walk = 1; - - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); - } - - WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - - if (!chain_walk) - return; - - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(owner); - - raw_spin_unlock(&lock->wait_lock); - - rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); - - raw_spin_lock(&lock->wait_lock); -} - -/* - * Recheck the pi chain, in case we got a priority setting - * - * Called from sched_setscheduler - */ -void rt_mutex_adjust_pi(struct task_struct *task) -{ - struct rt_mutex_waiter *waiter; - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - - waiter = task->pi_blocked_on; - if (!waiter || waiter->list_entry.prio == task->prio) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - return; - } - - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - - /* gets dropped in rt_mutex_adjust_prio_chain()! */ - get_task_struct(task); - rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); -} - -/** - * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop - * @lock: the rt_mutex to take - * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) - * @timeout: the pre-initialized and started timer, or NULL for none - * @waiter: the pre-initialized rt_mutex_waiter - * - * lock->wait_lock must be held by the caller. - */ -static int __sched -__rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) -{ - int ret = 0; - - for (;;) { - /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock, current, waiter)) - break; - - /* - * TASK_INTERRUPTIBLE checks for signals and - * timeout. Ignored otherwise. - */ - if (unlikely(state == TASK_INTERRUPTIBLE)) { - /* Signal pending? */ - if (signal_pending(current)) - ret = -EINTR; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - if (ret) - break; - } - - raw_spin_unlock(&lock->wait_lock); - - debug_rt_mutex_print_deadlock(waiter); - - schedule_rt_mutex(lock); - - raw_spin_lock(&lock->wait_lock); - set_current_state(state); - } - - return ret; -} - -/* - * Slow path lock function: - */ -static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock) -{ - struct rt_mutex_waiter waiter; - int ret = 0; - - debug_rt_mutex_init_waiter(&waiter); - - raw_spin_lock(&lock->wait_lock); - - /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock(&lock->wait_lock); - return 0; - } - - set_current_state(state); - - /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { - hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } - - ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); - - if (likely(!ret)) - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); - - set_current_state(TASK_RUNNING); - - if (unlikely(ret)) - remove_waiter(lock, &waiter); - - /* - * try_to_take_rt_mutex() sets the waiter bit - * unconditionally. We might have to fix that up. - */ - fixup_rt_mutex_waiters(lock); - - raw_spin_unlock(&lock->wait_lock); - - /* Remove pending timer: */ - if (unlikely(timeout)) - hrtimer_cancel(&timeout->timer); - - debug_rt_mutex_free_waiter(&waiter); - - return ret; -} - -/* - * Slow path try-lock function: - */ -static inline int -rt_mutex_slowtrylock(struct rt_mutex *lock) -{ - int ret = 0; - - raw_spin_lock(&lock->wait_lock); - - if (likely(rt_mutex_owner(lock) != current)) { - - ret = try_to_take_rt_mutex(lock, current, NULL); - /* - * try_to_take_rt_mutex() sets the lock waiters - * bit unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); - } - - raw_spin_unlock(&lock->wait_lock); - - return ret; -} - -/* - * Slow path to release a rt-mutex: - */ -static void __sched -rt_mutex_slowunlock(struct rt_mutex *lock) -{ - raw_spin_lock(&lock->wait_lock); - - debug_rt_mutex_unlock(lock); - - rt_mutex_deadlock_account_unlock(current); - - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); - return; - } - - wakeup_next_waiter(lock); - - raw_spin_unlock(&lock->wait_lock); - - /* Undo pi boosting if necessary: */ - rt_mutex_adjust_prio(current); -} - -/* - * debug aware fast / slowpath lock,trylock,unlock - * - * The atomic acquire/release ops are compiled away, when either the - * architecture does not support cmpxchg or when debugging is enabled. - */ -static inline int -rt_mutex_fastlock(struct rt_mutex *lock, int state, - int detect_deadlock, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock)) -{ - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); - return 0; - } else - return slowfn(lock, state, NULL, detect_deadlock); -} - -static inline int -rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, int detect_deadlock, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - int detect_deadlock)) -{ - if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); - return 0; - } else - return slowfn(lock, state, timeout, detect_deadlock); -} - -static inline int -rt_mutex_fasttrylock(struct rt_mutex *lock, - int (*slowfn)(struct rt_mutex *lock)) -{ - if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); - return 1; - } - return slowfn(lock); -} - -static inline void -rt_mutex_fastunlock(struct rt_mutex *lock, - void (*slowfn)(struct rt_mutex *lock)) -{ - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) - rt_mutex_deadlock_account_unlock(current); - else - slowfn(lock); -} - -/** - * rt_mutex_lock - lock a rt_mutex - * - * @lock: the rt_mutex to be locked - */ -void __sched rt_mutex_lock(struct rt_mutex *lock) -{ - might_sleep(); - - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock); - -/** - * rt_mutex_lock_interruptible - lock a rt_mutex interruptible - * - * @lock: the rt_mutex to be locked - * @detect_deadlock: deadlock detection on/off - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -EDEADLK when the lock would deadlock (when deadlock detection is on) - */ -int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, - int detect_deadlock) -{ - might_sleep(); - - return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, - detect_deadlock, rt_mutex_slowlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); - -/** - * rt_mutex_timed_lock - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller - * - * @lock: the rt_mutex to be locked - * @timeout: timeout structure or NULL (no timeout) - * @detect_deadlock: deadlock detection on/off - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -ETIMEDOUT when the timeout expired - * -EDEADLK when the lock would deadlock (when deadlock detection is on) - */ -int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, - int detect_deadlock) -{ - might_sleep(); - - return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - detect_deadlock, rt_mutex_slowlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); - -/** - * rt_mutex_trylock - try to lock a rt_mutex - * - * @lock: the rt_mutex to be locked - * - * Returns 1 on success and 0 on contention - */ -int __sched rt_mutex_trylock(struct rt_mutex *lock) -{ - return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); -} -EXPORT_SYMBOL_GPL(rt_mutex_trylock); - -/** - * rt_mutex_unlock - unlock a rt_mutex - * - * @lock: the rt_mutex to be unlocked - */ -void __sched rt_mutex_unlock(struct rt_mutex *lock) -{ - rt_mutex_fastunlock(lock, rt_mutex_slowunlock); -} -EXPORT_SYMBOL_GPL(rt_mutex_unlock); - -/** - * rt_mutex_destroy - mark a mutex unusable - * @lock: the mutex to be destroyed - * - * This function marks the mutex uninitialized, and any subsequent - * use of the mutex is forbidden. The mutex must not be locked when - * this function is called. - */ -void rt_mutex_destroy(struct rt_mutex *lock) -{ - WARN_ON(rt_mutex_is_locked(lock)); -#ifdef CONFIG_DEBUG_RT_MUTEXES - lock->magic = NULL; -#endif -} - -EXPORT_SYMBOL_GPL(rt_mutex_destroy); - -/** - * __rt_mutex_init - initialize the rt lock - * - * @lock: the rt lock to be initialized - * - * Initialize the rt lock to unlocked state. - * - * Initializing of a locked rt lock is not allowed - */ -void __rt_mutex_init(struct rt_mutex *lock, const char *name) -{ - lock->owner = NULL; - raw_spin_lock_init(&lock->wait_lock); - plist_head_init(&lock->wait_list); - - debug_rt_mutex_init(lock, name); -} -EXPORT_SYMBOL_GPL(__rt_mutex_init); - -/** - * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a - * proxy owner - * - * @lock: the rt_mutex to be locked - * @proxy_owner:the task to set as owner - * - * No locking. Caller has to do serializing itself - * Special API call for PI-futex support - */ -void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner) -{ - __rt_mutex_init(lock, NULL); - debug_rt_mutex_proxy_lock(lock, proxy_owner); - rt_mutex_set_owner(lock, proxy_owner); - rt_mutex_deadlock_account_lock(lock, proxy_owner); -} - -/** - * rt_mutex_proxy_unlock - release a lock on behalf of owner - * - * @lock: the rt_mutex to be locked - * - * No locking. Caller has to do serializing itself - * Special API call for PI-futex support - */ -void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner) -{ - debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL); - rt_mutex_deadlock_account_unlock(proxy_owner); -} - -/** - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * @detect_deadlock: perform deadlock detection (1) or not (0) - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for FUTEX_REQUEUE_PI support. - */ -int rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task, int detect_deadlock) -{ - int ret; - - raw_spin_lock(&lock->wait_lock); - - if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock(&lock->wait_lock); - return 1; - } - - ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - - if (ret && !rt_mutex_owner(lock)) { - /* - * Reset the return value. We might have - * returned with -EDEADLK and the owner - * released the lock while we were walking the - * pi chain. Let the waiter sort it out. - */ - ret = 0; - } - - if (unlikely(ret)) - remove_waiter(lock, waiter); - - raw_spin_unlock(&lock->wait_lock); - - debug_rt_mutex_print_deadlock(waiter); - - return ret; -} - -/** - * rt_mutex_next_owner - return the next owner of the lock - * - * @lock: the rt lock query - * - * Returns the next owner of the lock or NULL - * - * Caller has to serialize against other accessors to the lock - * itself. - * - * Special API call for PI-futex support - */ -struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) -{ - if (!rt_mutex_has_waiters(lock)) - return NULL; - - return rt_mutex_top_waiter(lock)->task; -} - -/** - * rt_mutex_finish_proxy_lock() - Complete lock acquisition - * @lock: the rt_mutex we were woken on - * @to: the timeout, null if none. hrtimer should already have - * been started. - * @waiter: the pre-initialized rt_mutex_waiter - * @detect_deadlock: perform deadlock detection (1) or not (0) - * - * Complete the lock acquisition started our behalf by another thread. - * - * Returns: - * 0 - success - * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK - * - * Special API call for PI-futex requeue support - */ -int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter, - int detect_deadlock) -{ - int ret; - - raw_spin_lock(&lock->wait_lock); - - set_current_state(TASK_INTERRUPTIBLE); - - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); - - set_current_state(TASK_RUNNING); - - if (unlikely(ret)) - remove_waiter(lock, waiter); - - /* - * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might - * have to fix that up. - */ - fixup_rt_mutex_waiters(lock); - - raw_spin_unlock(&lock->wait_lock); - - return ret; -} diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h deleted file mode 100644 index a1a1dd06421d..000000000000 --- a/kernel/rtmutex.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * RT-Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner - * - * This file contains macros used solely by rtmutex.c. - * Non-debug version. - */ - -#define rt_mutex_deadlock_check(l) (0) -#define rt_mutex_deadlock_account_lock(m, t) do { } while (0) -#define rt_mutex_deadlock_account_unlock(l) do { } while (0) -#define debug_rt_mutex_init_waiter(w) do { } while (0) -#define debug_rt_mutex_free_waiter(w) do { } while (0) -#define debug_rt_mutex_lock(l) do { } while (0) -#define debug_rt_mutex_proxy_lock(l,p) do { } while (0) -#define debug_rt_mutex_proxy_unlock(l) do { } while (0) -#define debug_rt_mutex_unlock(l) do { } while (0) -#define debug_rt_mutex_init(m, n) do { } while (0) -#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) -#define debug_rt_mutex_print_deadlock(w) do { } while (0) -#define debug_rt_mutex_detect_deadlock(w,d) (d) -#define debug_rt_mutex_reset_waiter(w) do { } while (0) diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h deleted file mode 100644 index 53a66c85261b..000000000000 --- a/kernel/rtmutex_common.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * RT Mutexes: blocking mutual exclusion locks with PI support - * - * started by Ingo Molnar and Thomas Gleixner: - * - * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner - * - * This file contains the private data structure and API definitions. - */ - -#ifndef __KERNEL_RTMUTEX_COMMON_H -#define __KERNEL_RTMUTEX_COMMON_H - -#include - -/* - * The rtmutex in kernel tester is independent of rtmutex debugging. We - * call schedule_rt_mutex_test() instead of schedule() for the tasks which - * belong to the tester. That way we can delay the wakeup path of those - * threads to provoke lock stealing and testing of complex boosting scenarios. - */ -#ifdef CONFIG_RT_MUTEX_TESTER - -extern void schedule_rt_mutex_test(struct rt_mutex *lock); - -#define schedule_rt_mutex(_lock) \ - do { \ - if (!(current->flags & PF_MUTEX_TESTER)) \ - schedule(); \ - else \ - schedule_rt_mutex_test(_lock); \ - } while (0) - -#else -# define schedule_rt_mutex(_lock) schedule() -#endif - -/* - * This is the control structure for tasks blocked on a rt_mutex, - * which is allocated on the kernel stack on of the blocked task. - * - * @list_entry: pi node to enqueue into the mutex waiters list - * @pi_list_entry: pi node to enqueue into the mutex owner waiters list - * @task: task reference to the blocked task - */ -struct rt_mutex_waiter { - struct plist_node list_entry; - struct plist_node pi_list_entry; - struct task_struct *task; - struct rt_mutex *lock; -#ifdef CONFIG_DEBUG_RT_MUTEXES - unsigned long ip; - struct pid *deadlock_task_pid; - struct rt_mutex *deadlock_lock; -#endif -}; - -/* - * Various helpers to access the waiters-plist: - */ -static inline int rt_mutex_has_waiters(struct rt_mutex *lock) -{ - return !plist_head_empty(&lock->wait_list); -} - -static inline struct rt_mutex_waiter * -rt_mutex_top_waiter(struct rt_mutex *lock) -{ - struct rt_mutex_waiter *w; - - w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, - list_entry); - BUG_ON(w->lock != lock); - - return w; -} - -static inline int task_has_pi_waiters(struct task_struct *p) -{ - return !plist_head_empty(&p->pi_waiters); -} - -static inline struct rt_mutex_waiter * -task_top_pi_waiter(struct task_struct *p) -{ - return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, - pi_list_entry); -} - -/* - * lock->owner state tracking: - */ -#define RT_MUTEX_HAS_WAITERS 1UL -#define RT_MUTEX_OWNER_MASKALL 1UL - -static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) -{ - return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); -} - -/* - * PI-futex support (proxy locking functions, etc.): - */ -extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); -extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner); -extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, - struct rt_mutex_waiter *waiter, - struct task_struct *task, - int detect_deadlock); -extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter, - int detect_deadlock); - -#ifdef CONFIG_DEBUG_RT_MUTEXES -# include "rtmutex-debug.h" -#else -# include "rtmutex.h" -#endif - -#endif -- cgit v1.3 From ed428bfc3caaa4b1e6cd15ea12c90c30291903f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 Oct 2013 18:19:28 +0100 Subject: locking: Move the rwsem code to kernel/locking/ Notably: changed lib/rwsem* targets from lib- to obj-, no idea about the ramifications of that. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-g0kynfh5feriwc6p3h6kpbw6@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 +- kernel/locking/Makefile | 4 +- kernel/locking/rwsem-spinlock.c | 296 ++++++++++++++++++++++++++++++++++++++++ kernel/locking/rwsem-xadd.c | 293 +++++++++++++++++++++++++++++++++++++++ kernel/locking/rwsem.c | 157 +++++++++++++++++++++ kernel/rwsem.c | 157 --------------------- lib/Makefile | 2 - lib/rwsem-spinlock.c | 296 ---------------------------------------- lib/rwsem.c | 293 --------------------------------------- 9 files changed, 750 insertions(+), 750 deletions(-) create mode 100644 kernel/locking/rwsem-spinlock.c create mode 100644 kernel/locking/rwsem-xadd.c create mode 100644 kernel/locking/rwsem.c delete mode 100644 kernel/rwsem.c delete mode 100644 lib/rwsem-spinlock.c delete mode 100644 lib/rwsem.c (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 9c2ad1852223..1aef002aa56b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ extable.o params.o posix-timers.o \ kthread.o sys_ni.o posix-cpu-timers.o \ - hrtimer.o rwsem.o nsproxy.o \ + hrtimer.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o groups.o lglock.o smpboot.o diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 59f66dec2bf9..b0e0d73516e3 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o +obj-y += mutex.o semaphore.o rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -20,3 +20,5 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o \ No newline at end of file diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c new file mode 100644 index 000000000000..9be8a9144978 --- /dev/null +++ b/kernel/locking/rwsem-spinlock.c @@ -0,0 +1,296 @@ +/* rwsem-spinlock.c: R/W semaphores: contention handling functions for + * generic spinlock implementation + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli + * - Derived also from comments by Linus + */ +#include +#include +#include + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; +}; + +int rwsem_is_locked(struct rw_semaphore *sem) +{ + int ret = 1; + unsigned long flags; + + if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { + ret = (sem->activity != 0); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + } + return ret; +} +EXPORT_SYMBOL(rwsem_is_locked); + +/* + * initialise the semaphore + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + sem->activity = 0; + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} +EXPORT_SYMBOL(__init_rwsem); + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here, then: + * - the 'active count' _reached_ zero + * - the 'waiting count' is non-zero + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if wakewrite is non-zero + */ +static inline struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +{ + struct rwsem_waiter *waiter; + struct task_struct *tsk; + int woken; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wakewrite) + /* Wake up a writer. Note that we do not grant it the + * lock - it will have to acquire it when it runs. */ + wake_up_process(waiter->task); + goto out; + } + + /* grant an infinite number of read locks to the front of the queue */ + woken = 0; + do { + struct list_head *next = waiter->list.next; + + list_del(&waiter->list); + tsk = waiter->task; + smp_mb(); + waiter->task = NULL; + wake_up_process(tsk); + put_task_struct(tsk); + woken++; + if (next == &sem->wait_list) + break; + waiter = list_entry(next, struct rwsem_waiter, list); + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); + + sem->activity += woken; + + out: + return sem; +} + +/* + * wake a single writer + */ +static inline struct rw_semaphore * +__rwsem_wake_one_writer(struct rw_semaphore *sem) +{ + struct rwsem_waiter *waiter; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + wake_up_process(waiter->task); + + return sem; +} + +/* + * get a read lock on the semaphore + */ +void __sched __down_read(struct rw_semaphore *sem) +{ + struct rwsem_waiter waiter; + struct task_struct *tsk; + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + /* granted */ + sem->activity++; + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + goto out; + } + + tsk = current; + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_READ; + get_task_struct(tsk); + + list_add_tail(&waiter.list, &sem->wait_list); + + /* we don't need to touch the semaphore struct anymore */ + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + /* wait to be given the lock */ + for (;;) { + if (!waiter.task) + break; + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } + + tsk->state = TASK_RUNNING; + out: + ; +} + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int __down_read_trylock(struct rw_semaphore *sem) +{ + unsigned long flags; + int ret = 0; + + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + /* granted */ + sem->activity++; + ret = 1; + } + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; +} + +/* + * get a write lock on the semaphore + */ +void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +{ + struct rwsem_waiter waiter; + struct task_struct *tsk; + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* set up my own style of waitqueue */ + tsk = current; + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_WRITE; + list_add_tail(&waiter.list, &sem->wait_list); + + /* wait for someone to release the lock */ + for (;;) { + /* + * That is the key to support write lock stealing: allows the + * task already on CPU to get the lock soon rather than put + * itself into sleep and waiting for system woke it or someone + * else in the head of the wait list up. + */ + if (sem->activity == 0) + break; + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + schedule(); + raw_spin_lock_irqsave(&sem->wait_lock, flags); + } + /* got the lock */ + sem->activity = -1; + list_del(&waiter.list); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +void __sched __down_write(struct rw_semaphore *sem) +{ + __down_write_nested(sem, 0); +} + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +int __down_write_trylock(struct rw_semaphore *sem) +{ + unsigned long flags; + int ret = 0; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (sem->activity == 0) { + /* got the lock */ + sem->activity = -1; + ret = 1; + } + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return ret; +} + +/* + * release a read lock on the semaphore + */ +void __up_read(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (--sem->activity == 0 && !list_empty(&sem->wait_list)) + sem = __rwsem_wake_one_writer(sem); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * release a write lock on the semaphore + */ +void __up_write(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + sem->activity = 0; + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, 1); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void __downgrade_write(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + sem->activity = 1; + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, 0); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c new file mode 100644 index 000000000000..19c5fa95e0b4 --- /dev/null +++ b/kernel/locking/rwsem-xadd.c @@ -0,0 +1,293 @@ +/* rwsem.c: R/W semaphores: contention handling functions + * + * Written by David Howells (dhowells@redhat.com). + * Derived from arch/i386/kernel/semaphore.c + * + * Writer lock-stealing by Alex Shi + * and Michel Lespinasse + */ +#include +#include +#include +#include + +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + sem->count = RWSEM_UNLOCKED_VALUE; + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); +} + +EXPORT_SYMBOL(__init_rwsem); + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; +}; + +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here from up_xxxx(), then: + * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) + * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) + * - there must be someone on the queue + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if downgrading is false + */ +static struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +{ + struct rwsem_waiter *waiter; + struct task_struct *tsk; + struct list_head *next; + long oldcount, woken, loop, adjustment; + + waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type == RWSEM_WAKE_ANY) + /* Wake writer at the front of the queue, but do not + * grant it the lock yet as we want other writers + * to be able to steal it. Readers, on the other hand, + * will block as they will notice the queued writer. + */ + wake_up_process(waiter->task); + goto out; + } + + /* Writers might steal the lock before we grant it to the next reader. + * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. + */ + adjustment = 0; + if (wake_type != RWSEM_WAKE_READ_OWNED) { + adjustment = RWSEM_ACTIVE_READ_BIAS; + try_reader_grant: + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { + /* A writer stole the lock. Undo our reader grant. */ + if (rwsem_atomic_update(-adjustment, sem) & + RWSEM_ACTIVE_MASK) + goto out; + /* Last active locker left. Retry waking readers. */ + goto try_reader_grant; + } + } + + /* Grant an infinite number of read locks to the readers at the front + * of the queue. Note we increment the 'active part' of the count by + * the number of readers before waking any processes up. + */ + woken = 0; + do { + woken++; + + if (waiter->list.next == &sem->wait_list) + break; + + waiter = list_entry(waiter->list.next, + struct rwsem_waiter, list); + + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); + + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + if (waiter->type != RWSEM_WAITING_FOR_WRITE) + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + + if (adjustment) + rwsem_atomic_add(adjustment, sem); + + next = sem->wait_list.next; + loop = woken; + do { + waiter = list_entry(next, struct rwsem_waiter, list); + next = waiter->list.next; + tsk = waiter->task; + smp_mb(); + waiter->task = NULL; + wake_up_process(tsk); + put_task_struct(tsk); + } while (--loop); + + sem->wait_list.next = next; + next->prev = &sem->wait_list; + + out: + return sem; +} + +/* + * wait for the read lock to be granted + */ +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +{ + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_READ; + get_task_struct(tsk); + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there are no active locks, wake the front queued process(es). + * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! + */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + + raw_spin_unlock_irq(&sem->wait_lock); + + /* wait to be given the lock */ + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!waiter.task) + break; + schedule(); + } + + tsk->state = TASK_RUNNING; + + return sem; +} + +/* + * wait until we successfully acquire the write lock + */ +struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +{ + long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = RWSEM_WAITING_FOR_WRITE; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. */ + if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + + /* wait until we successfully acquire the lock */ + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + while (true) { + if (!(count & RWSEM_ACTIVE_MASK)) { + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + RWSEM_WAITING_BIAS) + break; + } + + raw_spin_unlock_irq(&sem->wait_lock); + + /* Block until there are no active lockers. */ + do { + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } while ((count = sem->count) & RWSEM_ACTIVE_MASK); + + raw_spin_lock_irq(&sem->wait_lock); + } + + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); + tsk->state = TASK_RUNNING; + + return sem; +} + +/* + * handle waking up a waiter on the semaphore + * - up_read/up_write has decremented the active part of count if we come here + */ +struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* do nothing if list empty */ + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return sem; +} + +/* + * downgrade a write lock into a read lock + * - caller incremented waiting part of count and discovered it still negative + * - just wake up any readers at the front of the queue + */ +struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + /* do nothing if list empty */ + if (!list_empty(&sem->wait_list)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + + return sem; +} + +EXPORT_SYMBOL(rwsem_down_read_failed); +EXPORT_SYMBOL(rwsem_down_write_failed); +EXPORT_SYMBOL(rwsem_wake); +EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c new file mode 100644 index 000000000000..cfff1435bdfb --- /dev/null +++ b/kernel/locking/rwsem.c @@ -0,0 +1,157 @@ +/* kernel/rwsem.c: R/W semaphores, public implementation + * + * Written by David Howells (dhowells@redhat.com). + * Derived from asm-i386/semaphore.h + */ + +#include +#include +#include +#include +#include + +#include + +/* + * lock for reading + */ +void __sched down_read(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); +} + +EXPORT_SYMBOL(down_read); + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int down_read_trylock(struct rw_semaphore *sem) +{ + int ret = __down_read_trylock(sem); + + if (ret == 1) + rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); + return ret; +} + +EXPORT_SYMBOL(down_read_trylock); + +/* + * lock for writing + */ +void __sched down_write(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +} + +EXPORT_SYMBOL(down_write); + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +int down_write_trylock(struct rw_semaphore *sem) +{ + int ret = __down_write_trylock(sem); + + if (ret == 1) + rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); + return ret; +} + +EXPORT_SYMBOL(down_write_trylock); + +/* + * release a read lock + */ +void up_read(struct rw_semaphore *sem) +{ + rwsem_release(&sem->dep_map, 1, _RET_IP_); + + __up_read(sem); +} + +EXPORT_SYMBOL(up_read); + +/* + * release a write lock + */ +void up_write(struct rw_semaphore *sem) +{ + rwsem_release(&sem->dep_map, 1, _RET_IP_); + + __up_write(sem); +} + +EXPORT_SYMBOL(up_write); + +/* + * downgrade write lock to read lock + */ +void downgrade_write(struct rw_semaphore *sem) +{ + /* + * lockdep: a downgraded write will live on as a write + * dependency. + */ + __downgrade_write(sem); +} + +EXPORT_SYMBOL(downgrade_write); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void down_read_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + + LOCK_CONTENDED(sem, __down_read_trylock, __down_read); +} + +EXPORT_SYMBOL(down_read_nested); + +void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) +{ + might_sleep(); + rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); + + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +} + +EXPORT_SYMBOL(_down_write_nest_lock); + +void down_read_non_owner(struct rw_semaphore *sem) +{ + might_sleep(); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read_non_owner); + +void down_write_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); + + LOCK_CONTENDED(sem, __down_write_trylock, __down_write); +} + +EXPORT_SYMBOL(down_write_nested); + +void up_read_non_owner(struct rw_semaphore *sem) +{ + __up_read(sem); +} + +EXPORT_SYMBOL(up_read_non_owner); + +#endif + + diff --git a/kernel/rwsem.c b/kernel/rwsem.c deleted file mode 100644 index cfff1435bdfb..000000000000 --- a/kernel/rwsem.c +++ /dev/null @@ -1,157 +0,0 @@ -/* kernel/rwsem.c: R/W semaphores, public implementation - * - * Written by David Howells (dhowells@redhat.com). - * Derived from asm-i386/semaphore.h - */ - -#include -#include -#include -#include -#include - -#include - -/* - * lock for reading - */ -void __sched down_read(struct rw_semaphore *sem) -{ - might_sleep(); - rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -} - -EXPORT_SYMBOL(down_read); - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int down_read_trylock(struct rw_semaphore *sem) -{ - int ret = __down_read_trylock(sem); - - if (ret == 1) - rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); - return ret; -} - -EXPORT_SYMBOL(down_read_trylock); - -/* - * lock for writing - */ -void __sched down_write(struct rw_semaphore *sem) -{ - might_sleep(); - rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -} - -EXPORT_SYMBOL(down_write); - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int down_write_trylock(struct rw_semaphore *sem) -{ - int ret = __down_write_trylock(sem); - - if (ret == 1) - rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); - return ret; -} - -EXPORT_SYMBOL(down_write_trylock); - -/* - * release a read lock - */ -void up_read(struct rw_semaphore *sem) -{ - rwsem_release(&sem->dep_map, 1, _RET_IP_); - - __up_read(sem); -} - -EXPORT_SYMBOL(up_read); - -/* - * release a write lock - */ -void up_write(struct rw_semaphore *sem) -{ - rwsem_release(&sem->dep_map, 1, _RET_IP_); - - __up_write(sem); -} - -EXPORT_SYMBOL(up_write); - -/* - * downgrade write lock to read lock - */ -void downgrade_write(struct rw_semaphore *sem) -{ - /* - * lockdep: a downgraded write will live on as a write - * dependency. - */ - __downgrade_write(sem); -} - -EXPORT_SYMBOL(downgrade_write); - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - -void down_read_nested(struct rw_semaphore *sem, int subclass) -{ - might_sleep(); - rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -} - -EXPORT_SYMBOL(down_read_nested); - -void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) -{ - might_sleep(); - rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); - - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -} - -EXPORT_SYMBOL(_down_write_nest_lock); - -void down_read_non_owner(struct rw_semaphore *sem) -{ - might_sleep(); - - __down_read(sem); -} - -EXPORT_SYMBOL(down_read_non_owner); - -void down_write_nested(struct rw_semaphore *sem, int subclass) -{ - might_sleep(); - rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -} - -EXPORT_SYMBOL(down_write_nested); - -void up_read_non_owner(struct rw_semaphore *sem) -{ - __up_read(sem); -} - -EXPORT_SYMBOL(up_read_non_owner); - -#endif - - diff --git a/lib/Makefile b/lib/Makefile index bee27e1d3431..ca8cadc8fe4d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -42,8 +42,6 @@ obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c deleted file mode 100644 index 9be8a9144978..000000000000 --- a/lib/rwsem-spinlock.c +++ /dev/null @@ -1,296 +0,0 @@ -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). - * - Derived partially from idea by Andrea Arcangeli - * - Derived also from comments by Linus - */ -#include -#include -#include - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->activity != 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - } - return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->activity = 0; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - * - the 'active count' _reached_ zero - * - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - int woken; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wakewrite) - /* Wake up a writer. Note that we do not grant it the - * lock - it will have to acquire it when it runs. */ - wake_up_process(waiter->task); - goto out; - } - - /* grant an infinite number of read locks to the front of the queue */ - woken = 0; - do { - struct list_head *next = waiter->list.next; - - list_del(&waiter->list); - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - woken++; - if (next == &sem->wait_list) - break; - waiter = list_entry(next, struct rwsem_waiter, list); - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - sem->activity += woken; - - out: - return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ - struct rwsem_waiter *waiter; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - wake_up_process(waiter->task); - - return sem; -} - -/* - * get a read lock on the semaphore - */ -void __sched __down_read(struct rw_semaphore *sem) -{ - struct rwsem_waiter waiter; - struct task_struct *tsk; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity++; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - tsk = current; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - /* wait to be given the lock */ - for (;;) { - if (!waiter.task) - break; - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } - - tsk->state = TASK_RUNNING; - out: - ; -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity++; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * get a write lock on the semaphore - */ -void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) -{ - struct rwsem_waiter waiter; - struct task_struct *tsk; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* set up my own style of waitqueue */ - tsk = current; - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait for someone to release the lock */ - for (;;) { - /* - * That is the key to support write lock stealing: allows the - * task already on CPU to get the lock soon rather than put - * itself into sleep and waiting for system woke it or someone - * else in the head of the wait list up. - */ - if (sem->activity == 0) - break; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - /* got the lock */ - sem->activity = -1; - list_del(&waiter.list); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_nested(sem, 0); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity == 0) { - /* got the lock */ - sem->activity = -1; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->activity == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/lib/rwsem.c b/lib/rwsem.c deleted file mode 100644 index 19c5fa95e0b4..000000000000 --- a/lib/rwsem.c +++ /dev/null @@ -1,293 +0,0 @@ -/* rwsem.c: R/W semaphores: contention handling functions - * - * Written by David Howells (dhowells@redhat.com). - * Derived from arch/i386/kernel/semaphore.c - * - * Writer lock-stealing by Alex Shi - * and Michel Lespinasse - */ -#include -#include -#include -#include - -/* - * Initialize an rwsem: - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->count = RWSEM_UNLOCKED_VALUE; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} - -EXPORT_SYMBOL(__init_rwsem); - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -enum rwsem_wake_type { - RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ - RWSEM_WAKE_READERS, /* Wake readers only */ - RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ -}; - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here from up_xxxx(), then: - * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) - * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) - * - there must be someone on the queue - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if downgrading is false - */ -static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - struct list_head *next; - long oldcount, woken, loop, adjustment; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) - /* Wake writer at the front of the queue, but do not - * grant it the lock yet as we want other writers - * to be able to steal it. Readers, on the other hand, - * will block as they will notice the queued writer. - */ - wake_up_process(waiter->task); - goto out; - } - - /* Writers might steal the lock before we grant it to the next reader. - * We prefer to do the first reader grant before counting readers - * so we can bail out early if a writer stole the lock. - */ - adjustment = 0; - if (wake_type != RWSEM_WAKE_READ_OWNED) { - adjustment = RWSEM_ACTIVE_READ_BIAS; - try_reader_grant: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* A writer stole the lock. Undo our reader grant. */ - if (rwsem_atomic_update(-adjustment, sem) & - RWSEM_ACTIVE_MASK) - goto out; - /* Last active locker left. Retry waking readers. */ - goto try_reader_grant; - } - } - - /* Grant an infinite number of read locks to the readers at the front - * of the queue. Note we increment the 'active part' of the count by - * the number of readers before waking any processes up. - */ - woken = 0; - do { - woken++; - - if (waiter->list.next == &sem->wait_list) - break; - - waiter = list_entry(waiter->list.next, - struct rwsem_waiter, list); - - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - - if (adjustment) - rwsem_atomic_add(adjustment, sem); - - next = sem->wait_list.next; - loop = woken; - do { - waiter = list_entry(next, struct rwsem_waiter, list); - next = waiter->list.next; - tsk = waiter->task; - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - } while (--loop); - - sem->wait_list.next = next; - next->prev = &sem->wait_list; - - out: - return sem; -} - -/* - * wait for the read lock to be granted - */ -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) -{ - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; - struct rwsem_waiter waiter; - struct task_struct *tsk = current; - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); - - /* If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); - - raw_spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ - while (true) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!waiter.task) - break; - schedule(); - } - - tsk->state = TASK_RUNNING; - - return sem; -} - -/* - * wait until we successfully acquire the write lock - */ -struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) -{ - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; - struct rwsem_waiter waiter; - struct task_struct *tsk = current; - - /* set up my own style of waitqueue */ - waiter.task = tsk; - waiter.type = RWSEM_WAITING_FOR_WRITE; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) - adjustment += RWSEM_WAITING_BIAS; - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); - - /* If there were already threads queued before us and there are no - * active writers, the lock must be read owned; so we try to wake - * any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); - - /* wait until we successfully acquire the lock */ - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - while (true) { - if (!(count & RWSEM_ACTIVE_MASK)) { - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - - if (sem->count == RWSEM_WAITING_BIAS && - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == - RWSEM_WAITING_BIAS) - break; - } - - raw_spin_unlock_irq(&sem->wait_lock); - - /* Block until there are no active lockers. */ - do { - schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } while ((count = sem->count) & RWSEM_ACTIVE_MASK); - - raw_spin_lock_irq(&sem->wait_lock); - } - - list_del(&waiter.list); - raw_spin_unlock_irq(&sem->wait_lock); - tsk->state = TASK_RUNNING; - - return sem; -} - -/* - * handle waking up a waiter on the semaphore - * - up_read/up_write has decremented the active part of count if we come here - */ -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return sem; -} - -/* - * downgrade a write lock into a read lock - * - caller incremented waiting part of count and discovered it still negative - * - just wake up any readers at the front of the queue - */ -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return sem; -} - -EXPORT_SYMBOL(rwsem_down_read_failed); -EXPORT_SYMBOL(rwsem_down_write_failed); -EXPORT_SYMBOL(rwsem_wake); -EXPORT_SYMBOL(rwsem_downgrade_wake); -- cgit v1.3 From cd4d241d57c99c6b00ef1799ad797d90f75a1da9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Nov 2013 11:51:33 +0100 Subject: locking: Move the lglocks code to kernel/locking/ Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-amd6pg1mif6tikbyktfvby3y@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 +- kernel/lglock.c | 89 ------------------------------------------------- kernel/locking/Makefile | 2 +- kernel/locking/lglock.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 91 deletions(-) delete mode 100644 kernel/lglock.c create mode 100644 kernel/locking/lglock.c (limited to 'kernel/Makefile') diff --git a/kernel/Makefile b/kernel/Makefile index 1aef002aa56b..09a9c94f42bd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ kthread.o sys_ni.o posix-cpu-timers.o \ hrtimer.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o groups.o lglock.o smpboot.o + async.o range.o groups.o smpboot.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/lglock.c b/kernel/lglock.c deleted file mode 100644 index 86ae2aebf004..000000000000 --- a/kernel/lglock.c +++ /dev/null @@ -1,89 +0,0 @@ -/* See include/linux/lglock.h for description */ -#include -#include -#include -#include - -/* - * Note there is no uninit, so lglocks cannot be defined in - * modules (but it's fine to use them from there) - * Could be added though, just undo lg_lock_init - */ - -void lg_lock_init(struct lglock *lg, char *name) -{ - LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); -} -EXPORT_SYMBOL(lg_lock_init); - -void lg_local_lock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock); - -void lg_local_unlock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock); - -void lg_local_lock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock_cpu); - -void lg_local_unlock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock_cpu); - -void lg_global_lock(struct lglock *lg) -{ - int i; - - preempt_disable(); - lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_lock(lock); - } -} -EXPORT_SYMBOL(lg_global_lock); - -void lg_global_unlock(struct lglock *lg) -{ - int i; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_unlock(lock); - } - preempt_enable(); -} -EXPORT_SYMBOL(lg_global_unlock); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index b0e0d73516e3..bdd313a3411d 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o +obj-y += mutex.o semaphore.o rwsem.o lglock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c new file mode 100644 index 000000000000..86ae2aebf004 --- /dev/null +++ b/kernel/locking/lglock.c @@ -0,0 +1,89 @@ +/* See include/linux/lglock.h for description */ +#include +#include +#include +#include + +/* + * Note there is no uninit, so lglocks cannot be defined in + * modules (but it's fine to use them from there) + * Could be added though, just undo lg_lock_init + */ + +void lg_lock_init(struct lglock *lg, char *name) +{ + LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); +} +EXPORT_SYMBOL(lg_lock_init); + +void lg_local_lock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + preempt_disable(); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); + lock = this_cpu_ptr(lg->lock); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock); + +void lg_local_unlock(struct lglock *lg) +{ + arch_spinlock_t *lock; + + lock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = this_cpu_ptr(lg->lock); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock); + +void lg_local_lock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + preempt_disable(); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_lock(lock); +} +EXPORT_SYMBOL(lg_local_lock_cpu); + +void lg_local_unlock_cpu(struct lglock *lg, int cpu) +{ + arch_spinlock_t *lock; + + lock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock = per_cpu_ptr(lg->lock, cpu); + arch_spin_unlock(lock); + preempt_enable(); +} +EXPORT_SYMBOL(lg_local_unlock_cpu); + +void lg_global_lock(struct lglock *lg) +{ + int i; + + preempt_disable(); + lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); + for_each_possible_cpu(i) { + arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_lock(lock); + } +} +EXPORT_SYMBOL(lg_global_lock); + +void lg_global_unlock(struct lglock *lg) +{ + int i; + + lock_release(&lg->lock_dep_map, 1, _RET_IP_); + for_each_possible_cpu(i) { + arch_spinlock_t *lock; + lock = per_cpu_ptr(lg->lock, i); + arch_spin_unlock(lock); + } + preempt_enable(); +} +EXPORT_SYMBOL(lg_global_unlock); -- cgit v1.3