Diffstat (limited to 'kernel')
145 files changed, 6460 insertions, 2551 deletions
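The most visible API addition in this range is the versioned audit feature-set interface (AUDIT_GET_FEATURE / AUDIT_SET_FEATURE) added to kernel/audit.c, which lets userspace query, change, and lock per-feature flags such as "only_unset_loginuid" and "loginuid_immutable". For orientation only, here is a minimal userspace sketch of setting one feature bit over the audit netlink socket; struct audit_features and the AUDIT_FEATURE_* names are taken from the hunks below, while the netlink boilerplate (socket setup, no ACK handling) is an illustrative assumption and not part of this patch. The request needs CAP_AUDIT_CONTROL.

/*
 * Illustrative only: ask the kernel to enforce "only unset loginuid may be
 * changed", using the AUDIT_SET_FEATURE command introduced in this series.
 * Assumes the uapi definitions added below (struct audit_features,
 * AUDIT_FEATURE_VERSION, AUDIT_FEATURE_TO_MASK, AUDIT_FEATURE_ONLY_UNSET_LOGINUID).
 */
#include <linux/audit.h>
#include <linux/netlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int set_only_unset_loginuid(void)
{
	struct sockaddr_nl addr = { .nl_family = AF_NETLINK }; /* nl_pid 0 == kernel */
	struct {
		struct nlmsghdr      nlh;
		struct audit_features af;
	} req;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(req.af));
	req.nlh.nlmsg_type  = AUDIT_SET_FEATURE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

	req.af.vers = AUDIT_FEATURE_VERSION;
	/* touch only this one feature bit ... */
	req.af.mask     = AUDIT_FEATURE_TO_MASK(AUDIT_FEATURE_ONLY_UNSET_LOGINUID);
	/* ... and turn it on; leave .lock at 0 so it can still be changed later */
	req.af.features = AUDIT_FEATURE_TO_MASK(AUDIT_FEATURE_ONLY_UNSET_LOGINUID);

	if (sendto(fd, &req, req.nlh.nlmsg_len, 0,
		   (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

As audit_set_feature() in the diff shows, setting a bit in .lock as well would make the corresponding feature permanent: any later attempt to flip a locked feature is refused with -EPERM and logged as an AUDIT_FEATURE_CHANGE record with res=0.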
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 94fabd534b03..2a202a846757 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ  	default 1000 if HZ_1000  config SCHED_HRTICK -	def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) +	def_bool HIGH_RES_TIMERS diff --git a/kernel/Makefile b/kernel/Makefile index 1ce47553fb02..bbaf7d59c1bb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -6,56 +6,44 @@ obj-y     = fork.o exec_domain.o panic.o \  	    cpu.o exit.o itimer.o time.o softirq.o resource.o \  	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \  	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ -	    rcupdate.o extable.o params.o posix-timers.o \ -	    kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ -	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ +	    extable.o params.o posix-timers.o \ +	    kthread.o sys_ni.o posix-cpu-timers.o \ +	    hrtimer.o nsproxy.o \  	    notifier.o ksysfs.o cred.o reboot.o \ -	    async.o range.o groups.o lglock.o smpboot.o +	    async.o range.o groups.o smpboot.o  ifdef CONFIG_FUNCTION_TRACER  # Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_lockdep.o = -pg -CFLAGS_REMOVE_lockdep_proc.o = -pg -CFLAGS_REMOVE_mutex-debug.o = -pg -CFLAGS_REMOVE_rtmutex-debug.o = -pg  CFLAGS_REMOVE_cgroup-debug.o = -pg  CFLAGS_REMOVE_irq_work.o = -pg  endif  obj-y += sched/ +obj-y += locking/  obj-y += power/  obj-y += printk/  obj-y += cpu/  obj-y += irq/ +obj-y += rcu/  obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o  obj-$(CONFIG_FREEZER) += freezer.o  obj-$(CONFIG_PROFILING) += profile.o  obj-$(CONFIG_STACKTRACE) += stacktrace.o  obj-y += time/ -obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o -obj-$(CONFIG_LOCKDEP) += lockdep.o -ifeq ($(CONFIG_PROC_FS),y) -obj-$(CONFIG_LOCKDEP) += lockdep_proc.o -endif  obj-$(CONFIG_FUTEX) += futex.o  ifeq ($(CONFIG_COMPAT),y)  obj-$(CONFIG_FUTEX) += futex_compat.o  endif -obj-$(CONFIG_RT_MUTEXES) += rtmutex.o -obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o  obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o  obj-$(CONFIG_SMP) += smp.o  ifneq ($(CONFIG_SMP),y)  obj-y += up.o  endif -obj-$(CONFIG_SMP) += spinlock.o -obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o -obj-$(CONFIG_PROVE_LOCKING) += spinlock.o  obj-$(CONFIG_UID16) += uid16.o +obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o  obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o  obj-$(CONFIG_KALLSYMS) += kallsyms.o  obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o  obj-$(CONFIG_KEXEC) += kexec.o @@ -81,12 +69,6 @@ obj-$(CONFIG_KGDB) += debug/  obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o  obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o  obj-$(CONFIG_SECCOMP) += seccomp.o -obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_TREE_RCU) += rcutree.o -obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o -obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o -obj-$(CONFIG_TINY_RCU) += rcutiny.o -obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o  obj-$(CONFIG_RELAY) += relay.o  obj-$(CONFIG_SYSCTL) += utsname_sysctl.o  obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o @@ -141,19 +123,52 @@ targets += timeconst.h  $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE  	$(call if_changed,bc) -ifeq ($(CONFIG_MODULE_SIG),y) +############################################################################### +# +# Roll all the X.509 
certificates that we can find together and pull them into +# the kernel so that they get loaded into the system trusted keyring during +# boot.  # -# Pull the signing certificate and any extra certificates into the kernel +# We look in the source root and the build root for all files whose name ends +# in ".x509".  Unfortunately, this will generate duplicate filenames, so we +# have make canonicalise the pathnames and then sort them to discard the +# duplicates.  # +############################################################################### +ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) +X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) +X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 +X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ +				$(or $(realpath $(CERT)),$(CERT)))) + +ifeq ($(X509_CERTIFICATES),) +$(warning *** No X.509 certificates found ***) +endif -quiet_cmd_touch = TOUCH   $@ -      cmd_touch = touch   $@ +ifneq ($(wildcard $(obj)/.x509.list),) +ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) +$(info X.509 certificate list changed) +$(shell rm $(obj)/.x509.list) +endif +endif + +kernel/system_certificates.o: $(obj)/x509_certificate_list + +quiet_cmd_x509certs  = CERTS   $@ +      cmd_x509certs  = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo "  - Including cert $(X509)") -extra_certificates: -	$(call cmd,touch) +targets += $(obj)/x509_certificate_list +$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list +	$(call if_changed,x509certs) -kernel/modsign_certificate.o: signing_key.x509 extra_certificates +targets += $(obj)/.x509.list +$(obj)/.x509.list: +	@echo $(X509_CERTIFICATES) >$@ +clean-files := x509_certificate_list .x509.list +endif + +ifeq ($(CONFIG_MODULE_SIG),y)  ###############################################################################  #  # If module signing is requested, say by allyesconfig, but a key has not been diff --git a/kernel/audit.c b/kernel/audit.c index 7b0e23a740ce..906ae5a0233a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -60,7 +60,6 @@  #ifdef CONFIG_SECURITY  #include <linux/security.h>  #endif -#include <net/netlink.h>  #include <linux/freezer.h>  #include <linux/tty.h>  #include <linux/pid_namespace.h> @@ -140,6 +139,17 @@ static struct task_struct *kauditd_task;  static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);  static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); +static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, +				   .mask = -1, +				   .features = 0, +				   .lock = 0,}; + +static char *audit_feature_names[2] = { +	"only_unset_loginuid", +	"loginuid_immutable", +}; + +  /* Serialize requests from userspace. 
*/  DEFINE_MUTEX(audit_cmd_mutex); @@ -584,6 +594,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)  		return -EOPNOTSUPP;  	case AUDIT_GET:  	case AUDIT_SET: +	case AUDIT_GET_FEATURE: +	case AUDIT_SET_FEATURE:  	case AUDIT_LIST_RULES:  	case AUDIT_ADD_RULE:  	case AUDIT_DEL_RULE: @@ -613,7 +625,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)  	int rc = 0;  	uid_t uid = from_kuid(&init_user_ns, current_uid()); -	if (!audit_enabled) { +	if (!audit_enabled && msg_type != AUDIT_USER_AVC) {  		*ab = NULL;  		return rc;  	} @@ -628,6 +640,94 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)  	return rc;  } +int is_audit_feature_set(int i) +{ +	return af.features & AUDIT_FEATURE_TO_MASK(i); +} + + +static int audit_get_feature(struct sk_buff *skb) +{ +	u32 seq; + +	seq = nlmsg_hdr(skb)->nlmsg_seq; + +	audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, +			 &af, sizeof(af)); + +	return 0; +} + +static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature, +				     u32 old_lock, u32 new_lock, int res) +{ +	struct audit_buffer *ab; + +	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); +	audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d", +			 audit_feature_names[which], !!old_feature, !!new_feature, +			 !!old_lock, !!new_lock, res); +	audit_log_end(ab); +} + +static int audit_set_feature(struct sk_buff *skb) +{ +	struct audit_features *uaf; +	int i; + +	BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); +	uaf = nlmsg_data(nlmsg_hdr(skb)); + +	/* if there is ever a version 2 we should handle that here */ + +	for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { +		u32 feature = AUDIT_FEATURE_TO_MASK(i); +		u32 old_feature, new_feature, old_lock, new_lock; + +		/* if we are not changing this feature, move along */ +		if (!(feature & uaf->mask)) +			continue; + +		old_feature = af.features & feature; +		new_feature = uaf->features & feature; +		new_lock = (uaf->lock | af.lock) & feature; +		old_lock = af.lock & feature; + +		/* are we changing a locked feature? 
*/ +		if ((af.lock & feature) && (new_feature != old_feature)) { +			audit_log_feature_change(i, old_feature, new_feature, +						 old_lock, new_lock, 0); +			return -EPERM; +		} +	} +	/* nothing invalid, do the changes */ +	for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { +		u32 feature = AUDIT_FEATURE_TO_MASK(i); +		u32 old_feature, new_feature, old_lock, new_lock; + +		/* if we are not changing this feature, move along */ +		if (!(feature & uaf->mask)) +			continue; + +		old_feature = af.features & feature; +		new_feature = uaf->features & feature; +		old_lock = af.lock & feature; +		new_lock = (uaf->lock | af.lock) & feature; + +		if (new_feature != old_feature) +			audit_log_feature_change(i, old_feature, new_feature, +						 old_lock, new_lock, 1); + +		if (new_feature) +			af.features |= feature; +		else +			af.features &= ~feature; +		af.lock |= new_lock; +	} + +	return 0; +} +  static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  {  	u32			seq; @@ -659,6 +759,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  	switch (msg_type) {  	case AUDIT_GET: +		memset(&status_set, 0, sizeof(status_set));  		status_set.enabled	 = audit_enabled;  		status_set.failure	 = audit_failure;  		status_set.pid		 = audit_pid; @@ -670,7 +771,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  				 &status_set, sizeof(status_set));  		break;  	case AUDIT_SET: -		if (nlh->nlmsg_len < sizeof(struct audit_status)) +		if (nlmsg_len(nlh) < sizeof(struct audit_status))  			return -EINVAL;  		status_get   = (struct audit_status *)data;  		if (status_get->mask & AUDIT_STATUS_ENABLED) { @@ -699,6 +800,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)  			err = audit_set_backlog_limit(status_get->backlog_limit);  		break; +	case AUDIT_GET_FEATURE: +		err = audit_get_feature(skb); +		if (err) +			return err; +		break; +	case AUDIT_SET_FEATURE: +		err = audit_set_feature(skb); +		if (err) +			return err; +		break;  	case AUDIT_USER:  	case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:  	case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: @@ -715,7 +826,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  			}  			audit_log_common_recv_msg(&ab, msg_type);  			if (msg_type != AUDIT_USER_TTY) -				audit_log_format(ab, " msg='%.1024s'", +				audit_log_format(ab, " msg='%.*s'", +						 AUDIT_MESSAGE_TEXT_MAX,  						 (char *)data);  			else {  				int size; @@ -818,7 +930,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		struct task_struct *tsk = current;  		spin_lock(&tsk->sighand->siglock); -		s.enabled = tsk->signal->audit_tty != 0; +		s.enabled = tsk->signal->audit_tty;  		s.log_passwd = tsk->signal->audit_tty_log_passwd;  		spin_unlock(&tsk->sighand->siglock); @@ -832,7 +944,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)  		memset(&s, 0, sizeof(s));  		/* guard against past and future API changes */ -		memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len)); +		memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));  		if ((s.enabled != 0 && s.enabled != 1) ||  		    (s.log_passwd != 0 && s.log_passwd != 1))  			return -EINVAL; @@ -1067,13 +1179,6 @@ static void wait_for_auditd(unsigned long sleep_time)  	remove_wait_queue(&audit_backlog_wait, &wait);  } -/* Obtain an audit buffer.  
This routine does locking to obtain the - * audit buffer, but then no locking is required for calls to - * audit_log_*format.  If the tsk is a task that is currently in a - * syscall, then the syscall is marked as auditable and an audit record - * will be written at syscall exit.  If there is no associated task, tsk - * should be NULL. */ -  /**   * audit_log_start - obtain an audit buffer   * @ctx: audit_context (may be NULL) @@ -1389,7 +1494,7 @@ void audit_log_session_info(struct audit_buffer *ab)  	u32 sessionid = audit_get_sessionid(current);  	uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); -	audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid); +	audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);  }  void audit_log_key(struct audit_buffer *ab, char *key) @@ -1536,6 +1641,26 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,  		}  	} +	/* log the audit_names record type */ +	audit_log_format(ab, " nametype="); +	switch(n->type) { +	case AUDIT_TYPE_NORMAL: +		audit_log_format(ab, "NORMAL"); +		break; +	case AUDIT_TYPE_PARENT: +		audit_log_format(ab, "PARENT"); +		break; +	case AUDIT_TYPE_CHILD_DELETE: +		audit_log_format(ab, "DELETE"); +		break; +	case AUDIT_TYPE_CHILD_CREATE: +		audit_log_format(ab, "CREATE"); +		break; +	default: +		audit_log_format(ab, "UNKNOWN"); +		break; +	} +  	audit_log_fcaps(ab, n);  	audit_log_end(ab);  } diff --git a/kernel/audit.h b/kernel/audit.h index 123c9b7c3979..b779642b29af 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -197,6 +197,9 @@ struct audit_context {  			int			fd;  			int			flags;  		} mmap; +		struct { +			int			argc; +		} execve;  	};  	int fds[2]; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f7aee8be7fb2..51f3fd4c1ed3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -343,6 +343,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)  	case AUDIT_DEVMINOR:  	case AUDIT_EXIT:  	case AUDIT_SUCCESS: +	case AUDIT_INODE:  		/* bit ops are only useful on syscall args */  		if (f->op == Audit_bitmask || f->op == Audit_bittest)  			return -EINVAL; @@ -423,7 +424,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,  		f->lsm_rule = NULL;  		/* Support legacy tests for a valid loginuid */ -		if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) { +		if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {  			f->type = AUDIT_LOGINUID_SET;  			f->val = 0;  		} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9845cb32b60a..90594c9f7552 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -95,13 +95,6 @@ struct audit_aux_data {  /* Number of target pids per aux struct. 
*/  #define AUDIT_AUX_PIDS	16 -struct audit_aux_data_execve { -	struct audit_aux_data	d; -	int argc; -	int envc; -	struct mm_struct *mm; -}; -  struct audit_aux_data_pids {  	struct audit_aux_data	d;  	pid_t			target_pid[AUDIT_AUX_PIDS]; @@ -121,12 +114,6 @@ struct audit_aux_data_bprm_fcaps {  	struct audit_cap_data	new_pcap;  }; -struct audit_aux_data_capset { -	struct audit_aux_data	d; -	pid_t			pid; -	struct audit_cap_data	cap; -}; -  struct audit_tree_refs {  	struct audit_tree_refs *next;  	struct audit_chunk *c[31]; @@ -566,7 +553,7 @@ static int audit_filter_rules(struct task_struct *tsk,  			break;  		case AUDIT_INODE:  			if (name) -				result = (name->ino == f->val); +				result = audit_comparator(name->ino, f->op, f->val);  			else if (ctx) {  				list_for_each_entry(n, &ctx->names_list, list) {  					if (audit_comparator(n->ino, f->op, f->val)) { @@ -943,8 +930,10 @@ int audit_alloc(struct task_struct *tsk)  		return 0; /* Return if not auditing. */  	state = audit_filter_task(tsk, &key); -	if (state == AUDIT_DISABLED) +	if (state == AUDIT_DISABLED) { +		clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);  		return 0; +	}  	if (!(context = audit_alloc_context(state))) {  		kfree(key); @@ -1149,20 +1138,16 @@ static int audit_log_single_execve_arg(struct audit_context *context,  }  static void audit_log_execve_info(struct audit_context *context, -				  struct audit_buffer **ab, -				  struct audit_aux_data_execve *axi) +				  struct audit_buffer **ab)  {  	int i, len;  	size_t len_sent = 0;  	const char __user *p;  	char *buf; -	if (axi->mm != current->mm) -		return; /* execve failed, no additional info */ - -	p = (const char __user *)axi->mm->arg_start; +	p = (const char __user *)current->mm->arg_start; -	audit_log_format(*ab, "argc=%d", axi->argc); +	audit_log_format(*ab, "argc=%d", context->execve.argc);  	/*  	 * we need some kernel buffer to hold the userspace args.  
Just @@ -1176,7 +1161,7 @@ static void audit_log_execve_info(struct audit_context *context,  		return;  	} -	for (i = 0; i < axi->argc; i++) { +	for (i = 0; i < context->execve.argc; i++) {  		len = audit_log_single_execve_arg(context, ab, i,  						  &len_sent, p, buf);  		if (len <= 0) @@ -1279,6 +1264,9 @@ static void show_special(struct audit_context *context, int *call_panic)  		audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,  				 context->mmap.flags);  		break; } +	case AUDIT_EXECVE: { +		audit_log_execve_info(context, &ab); +		break; }  	}  	audit_log_end(ab);  } @@ -1325,11 +1313,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts  		switch (aux->type) { -		case AUDIT_EXECVE: { -			struct audit_aux_data_execve *axi = (void *)aux; -			audit_log_execve_info(context, &ab, axi); -			break; } -  		case AUDIT_BPRM_FCAPS: {  			struct audit_aux_data_bprm_fcaps *axs = (void *)aux;  			audit_log_format(ab, "fver=%x", axs->fcap_ver); @@ -1964,6 +1947,43 @@ int auditsc_get_stamp(struct audit_context *ctx,  /* global counter which is incremented every time something logs in */  static atomic_t session_id = ATOMIC_INIT(0); +static int audit_set_loginuid_perm(kuid_t loginuid) +{ +	/* if we are unset, we don't need privs */ +	if (!audit_loginuid_set(current)) +		return 0; +	/* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ +	if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) +		return -EPERM; +	/* it is set, you need permission */ +	if (!capable(CAP_AUDIT_CONTROL)) +		return -EPERM; +	/* reject if this is not an unset and we don't allow that */ +	if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) +		return -EPERM; +	return 0; +} + +static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, +				   unsigned int oldsessionid, unsigned int sessionid, +				   int rc) +{ +	struct audit_buffer *ab; +	uid_t uid, ologinuid, nloginuid; + +	uid = from_kuid(&init_user_ns, task_uid(current)); +	ologinuid = from_kuid(&init_user_ns, koldloginuid); +	nloginuid = from_kuid(&init_user_ns, kloginuid), + +	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); +	if (!ab) +		return; +	audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old " +			 "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid, +			 nloginuid, oldsessionid, sessionid, !rc); +	audit_log_end(ab); +} +  /**   * audit_set_loginuid - set current task's audit_context loginuid   * @loginuid: loginuid value @@ -1975,37 +1995,26 @@ static atomic_t session_id = ATOMIC_INIT(0);  int audit_set_loginuid(kuid_t loginuid)  {  	struct task_struct *task = current; -	struct audit_context *context = task->audit_context; -	unsigned int sessionid; +	unsigned int oldsessionid, sessionid = (unsigned int)-1; +	kuid_t oldloginuid; +	int rc; -#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE -	if (audit_loginuid_set(task)) -		return -EPERM; -#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ -	if (!capable(CAP_AUDIT_CONTROL)) -		return -EPERM; -#endif  /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ +	oldloginuid = audit_get_loginuid(current); +	oldsessionid = audit_get_sessionid(current); -	sessionid = atomic_inc_return(&session_id); -	if (context && context->in_syscall) { -		struct audit_buffer *ab; +	rc = audit_set_loginuid_perm(loginuid); +	if (rc) +		goto out; + +	/* are we setting or clearing? 
*/ +	if (uid_valid(loginuid)) +		sessionid = atomic_inc_return(&session_id); -		ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); -		if (ab) { -			audit_log_format(ab, "login pid=%d uid=%u " -				"old auid=%u new auid=%u" -				" old ses=%u new ses=%u", -				task->pid, -				from_kuid(&init_user_ns, task_uid(task)), -				from_kuid(&init_user_ns, task->loginuid), -				from_kuid(&init_user_ns, loginuid), -				task->sessionid, sessionid); -			audit_log_end(ab); -		} -	}  	task->sessionid = sessionid;  	task->loginuid = loginuid; -	return 0; +out: +	audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); +	return rc;  }  /** @@ -2126,22 +2135,12 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo  	context->ipc.has_perm = 1;  } -int __audit_bprm(struct linux_binprm *bprm) +void __audit_bprm(struct linux_binprm *bprm)  { -	struct audit_aux_data_execve *ax;  	struct audit_context *context = current->audit_context; -	ax = kmalloc(sizeof(*ax), GFP_KERNEL); -	if (!ax) -		return -ENOMEM; - -	ax->argc = bprm->argc; -	ax->envc = bprm->envc; -	ax->mm = bprm->mm; -	ax->d.type = AUDIT_EXECVE; -	ax->d.next = context->aux; -	context->aux = (void *)ax; -	return 0; +	context->type = AUDIT_EXECVE; +	context->execve.argc = bprm->argc;  } diff --git a/kernel/bounds.c b/kernel/bounds.c index 0c9b862292b2..5253204afdca 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -10,6 +10,8 @@  #include <linux/mmzone.h>  #include <linux/kbuild.h>  #include <linux/page_cgroup.h> +#include <linux/log2.h> +#include <linux/spinlock_types.h>  void foo(void)  { @@ -17,5 +19,9 @@ void foo(void)  	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);  	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);  	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); +#ifdef CONFIG_SMP +	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); +#endif +	DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int));  	/* End of constants */  } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2418b6e71a85..8b729c278b64 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex);  static DEFINE_MUTEX(cgroup_root_mutex);  /* + * cgroup destruction makes heavy use of work items and there can be a lot + * of concurrent destructions.  Use a separate workqueue so that cgroup + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_destroy_wq; + +/*   * Generate an array of cgroup subsystem pointers. At boot time, this is   * populated with the built in subsystems, and modular subsystems are   * registered after that. The mutable section of this array is protected by @@ -125,38 +133,6 @@ struct cfent {  };  /* - * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when - * cgroup_subsys->use_id != 0. - */ -#define CSS_ID_MAX	(65535) -struct css_id { -	/* -	 * The css to which this ID points. This pointer is set to valid value -	 * after cgroup is populated. If cgroup is removed, this will be NULL. -	 * This pointer is expected to be RCU-safe because destroy() -	 * is called after synchronize_rcu(). But for safe use, css_tryget() -	 * should be used for avoiding race. -	 */ -	struct cgroup_subsys_state __rcu *css; -	/* -	 * ID of this css. -	 */ -	unsigned short id; -	/* -	 * Depth in hierarchy which this ID belongs to. -	 */ -	unsigned short depth; -	/* -	 * ID is freed by RCU. (and lookup routine is RCU safe.) -	 */ -	struct rcu_head rcu_head; -	/* -	 * Hierarchy of CSS ID belongs to. 
-	 */ -	unsigned short stack[0]; /* Array of Length (depth+1) */ -}; - -/*   * cgroup_event represents events which userspace want to receive.   */  struct cgroup_event { @@ -223,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp);  static int cgroup_destroy_locked(struct cgroup *cgrp);  static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],  			      bool is_add); +static int cgroup_file_release(struct inode *inode, struct file *file);  /**   * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -387,9 +364,6 @@ struct cgrp_cset_link {  static struct css_set init_css_set;  static struct cgrp_cset_link init_cgrp_cset_link; -static int cgroup_init_idr(struct cgroup_subsys *ss, -			   struct cgroup_subsys_state *css); -  /*   * css_set_lock protects the list of css_set objects, and the chain of   * tasks off each css_set.  Nests outside task->alloc_lock due to @@ -841,8 +815,6 @@ static struct backing_dev_info cgroup_backing_dev_info = {  	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,  }; -static int alloc_css_id(struct cgroup_subsys_state *child_css); -  static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)  {  	struct inode *inode = new_inode(sb); @@ -908,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head)  	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);  	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); -	schedule_work(&cgrp->destroy_work); +	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);  }  static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@ -932,11 +904,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)  	iput(inode);  } -static int cgroup_delete(const struct dentry *d) -{ -	return 1; -} -  static void remove_dir(struct dentry *d)  {  	struct dentry *parent = dget(d->d_parent); @@ -1523,7 +1490,7 @@ static int cgroup_get_rootdir(struct super_block *sb)  {  	static const struct dentry_operations cgroup_dops = {  		.d_iput = cgroup_diput, -		.d_delete = cgroup_delete, +		.d_delete = always_delete_dentry,  	};  	struct inode *inode = @@ -2039,7 +2006,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,  		/* @tsk either already exited or can't exit until the end */  		if (tsk->flags & PF_EXITING) -			continue; +			goto next;  		/* as per above, nr_threads may decrease, but not increase. */  		BUG_ON(i >= group_size); @@ -2047,7 +2014,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,  		ent.cgrp = task_cgroup_from_root(tsk, root);  		/* nothing to do if this task is already in the cgroup */  		if (ent.cgrp == cgrp) -			continue; +			goto next;  		/*  		 * saying GFP_ATOMIC has no effect here because we did prealloc  		 * earlier, but it's good form to communicate our expectations. 
@@ -2055,7 +2022,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,  		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);  		BUG_ON(retval != 0);  		i++; - +	next:  		if (!threadgroup)  			break;  	} while_each_thread(leader, tsk); @@ -2463,7 +2430,7 @@ static const struct file_operations cgroup_seqfile_operations = {  	.read = seq_read,  	.write = cgroup_file_write,  	.llseek = seq_lseek, -	.release = single_release, +	.release = cgroup_file_release,  };  static int cgroup_file_open(struct inode *inode, struct file *file) @@ -2524,6 +2491,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file)  		ret = cft->release(inode, file);  	if (css->ss)  		css_put(css); +	if (file->f_op == &cgroup_seqfile_operations) +		single_release(inode, file);  	return ret;  } @@ -3188,11 +3157,9 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,  	WARN_ON_ONCE(!rcu_read_lock_held()); -	/* if first iteration, visit the leftmost descendant */ -	if (!pos) { -		next = css_leftmost_descendant(root); -		return next != root ? next : NULL; -	} +	/* if first iteration, visit leftmost descendant which may be @root */ +	if (!pos) +		return css_leftmost_descendant(root);  	/* if we visited @root, we're done */  	if (pos == root) @@ -4242,21 +4209,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)  				goto err;  		}  	} - -	/* This cgroup is ready now */ -	for_each_root_subsys(cgrp->root, ss) { -		struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); -		struct css_id *id = rcu_dereference_protected(css->id, true); - -		/* -		 * Update id->css pointer and make this css visible from -		 * CSS ID functions. This pointer will be dereferened -		 * from RCU-read-side without locks. -		 */ -		if (id) -			rcu_assign_pointer(id->css, css); -	} -  	return 0;  err:  	cgroup_clear_dir(cgrp, subsys_mask); @@ -4308,7 +4260,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head)  	 * css_put().  dput() requires process context which we don't have.  	 */  	INIT_WORK(&css->destroy_work, css_free_work_fn); -	schedule_work(&css->destroy_work); +	queue_work(cgroup_destroy_wq, &css->destroy_work);  }  static void css_release(struct percpu_ref *ref) @@ -4325,7 +4277,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,  	css->cgroup = cgrp;  	css->ss = ss;  	css->flags = 0; -	css->id = NULL;  	if (cgrp->parent)  		css->parent = cgroup_css(cgrp->parent, ss); @@ -4457,12 +4408,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  			goto err_free_all;  		init_css(css, ss, cgrp); - -		if (ss->use_id) { -			err = alloc_css_id(css); -			if (err) -				goto err_free_all; -		}  	}  	/* @@ -4605,7 +4550,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref)  		container_of(ref, struct cgroup_subsys_state, refcnt);  	INIT_WORK(&css->destroy_work, css_killed_work_fn); -	schedule_work(&css->destroy_work); +	queue_work(cgroup_destroy_wq, &css->destroy_work);  }  /** @@ -4927,12 +4872,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)  	/* our new subsystem will be attached to the dummy hierarchy. */  	init_css(css, ss, cgroup_dummy_top); -	/* init_idr must be after init_css() because it sets css->id. */ -	if (ss->use_id) { -		ret = cgroup_init_idr(ss, css); -		if (ret) -			goto err_unload; -	}  	/*  	 * Now we need to entangle the css into the existing css_sets. 
unlike @@ -4998,9 +4937,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)  	offline_css(cgroup_css(cgroup_dummy_top, ss)); -	if (ss->use_id) -		idr_destroy(&ss->idr); -  	/* deassign the subsys_id */  	cgroup_subsys[ss->subsys_id] = NULL; @@ -5027,8 +4963,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)  	/*  	 * remove subsystem's css from the cgroup_dummy_top and free it -  	 * need to free before marking as null because ss->css_free needs -	 * the cgrp->subsys pointer to find their state. note that this -	 * also takes care of freeing the css_id. +	 * the cgrp->subsys pointer to find their state.  	 */  	ss->css_free(cgroup_css(cgroup_dummy_top, ss));  	RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); @@ -5099,8 +5034,6 @@ int __init cgroup_init(void)  	for_each_builtin_subsys(ss, i) {  		if (!ss->early_init)  			cgroup_init_subsys(ss); -		if (ss->use_id) -			cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);  	}  	/* allocate id for the dummy hierarchy */ @@ -5141,6 +5074,22 @@ out:  	return err;  } +static int __init cgroup_wq_init(void) +{ +	/* +	 * There isn't much point in executing destruction path in +	 * parallel.  Good chunk is serialized with cgroup_mutex anyway. +	 * Use 1 for @max_active. +	 * +	 * We would prefer to do this in cgroup_init() above, but that +	 * is called before init_workqueues(): so leave this until after. +	 */ +	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); +	BUG_ON(!cgroup_destroy_wq); +	return 0; +} +core_initcall(cgroup_wq_init); +  /*   * proc_cgroup_show()   *  - Print task's cgroup paths into seq_file, one line for each hierarchy @@ -5520,181 +5469,6 @@ static int __init cgroup_disable(char *str)  }  __setup("cgroup_disable=", cgroup_disable); -/* - * Functons for CSS ID. - */ - -/* to get ID other than 0, this should be called when !cgroup_is_dead() */ -unsigned short css_id(struct cgroup_subsys_state *css) -{ -	struct css_id *cssid; - -	/* -	 * This css_id() can return correct value when somone has refcnt -	 * on this or this is under rcu_read_lock(). Once css->id is allocated, -	 * it's unchanged until freed. -	 */ -	cssid = rcu_dereference_raw(css->id); - -	if (cssid) -		return cssid->id; -	return 0; -} -EXPORT_SYMBOL_GPL(css_id); - -/** - *  css_is_ancestor - test "root" css is an ancestor of "child" - * @child: the css to be tested. - * @root: the css supporsed to be an ancestor of the child. - * - * Returns true if "root" is an ancestor of "child" in its hierarchy. Because - * this function reads css->id, the caller must hold rcu_read_lock(). - * But, considering usual usage, the csses should be valid objects after test. - * Assuming that the caller will do some action to the child if this returns - * returns true, the caller must take "child";s reference count. - * If "child" is valid object and this returns true, "root" is valid, too. 
- */ - -bool css_is_ancestor(struct cgroup_subsys_state *child, -		    const struct cgroup_subsys_state *root) -{ -	struct css_id *child_id; -	struct css_id *root_id; - -	child_id  = rcu_dereference(child->id); -	if (!child_id) -		return false; -	root_id = rcu_dereference(root->id); -	if (!root_id) -		return false; -	if (child_id->depth < root_id->depth) -		return false; -	if (child_id->stack[root_id->depth] != root_id->id) -		return false; -	return true; -} - -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) -{ -	struct css_id *id = rcu_dereference_protected(css->id, true); - -	/* When this is called before css_id initialization, id can be NULL */ -	if (!id) -		return; - -	BUG_ON(!ss->use_id); - -	rcu_assign_pointer(id->css, NULL); -	rcu_assign_pointer(css->id, NULL); -	spin_lock(&ss->id_lock); -	idr_remove(&ss->idr, id->id); -	spin_unlock(&ss->id_lock); -	kfree_rcu(id, rcu_head); -} -EXPORT_SYMBOL_GPL(free_css_id); - -/* - * This is called by init or create(). Then, calls to this function are - * always serialized (By cgroup_mutex() at create()). - */ - -static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) -{ -	struct css_id *newid; -	int ret, size; - -	BUG_ON(!ss->use_id); - -	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); -	newid = kzalloc(size, GFP_KERNEL); -	if (!newid) -		return ERR_PTR(-ENOMEM); - -	idr_preload(GFP_KERNEL); -	spin_lock(&ss->id_lock); -	/* Don't use 0. allocates an ID of 1-65535 */ -	ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); -	spin_unlock(&ss->id_lock); -	idr_preload_end(); - -	/* Returns error when there are no free spaces for new ID.*/ -	if (ret < 0) -		goto err_out; - -	newid->id = ret; -	newid->depth = depth; -	return newid; -err_out: -	kfree(newid); -	return ERR_PTR(ret); - -} - -static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, -					    struct cgroup_subsys_state *rootcss) -{ -	struct css_id *newid; - -	spin_lock_init(&ss->id_lock); -	idr_init(&ss->idr); - -	newid = get_new_cssid(ss, 0); -	if (IS_ERR(newid)) -		return PTR_ERR(newid); - -	newid->stack[0] = newid->id; -	RCU_INIT_POINTER(newid->css, rootcss); -	RCU_INIT_POINTER(rootcss->id, newid); -	return 0; -} - -static int alloc_css_id(struct cgroup_subsys_state *child_css) -{ -	struct cgroup_subsys_state *parent_css = css_parent(child_css); -	struct css_id *child_id, *parent_id; -	int i, depth; - -	parent_id = rcu_dereference_protected(parent_css->id, true); -	depth = parent_id->depth + 1; - -	child_id = get_new_cssid(child_css->ss, depth); -	if (IS_ERR(child_id)) -		return PTR_ERR(child_id); - -	for (i = 0; i < depth; i++) -		child_id->stack[i] = parent_id->stack[i]; -	child_id->stack[depth] = child_id->id; -	/* -	 * child_id->css pointer will be set after this cgroup is available -	 * see cgroup_populate_dir() -	 */ -	rcu_assign_pointer(child_css->id, child_id); - -	return 0; -} - -/** - * css_lookup - lookup css by id - * @ss: cgroup subsys to be looked into. - * @id: the id - * - * Returns pointer to cgroup_subsys_state if there is valid one with id. - * NULL if not. 
Should be called under rcu_read_lock() - */ -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) -{ -	struct css_id *cssid = NULL; - -	BUG_ON(!ss->use_id); -	cssid = idr_find(&ss->idr, id); - -	if (unlikely(!cssid)) -		return NULL; - -	return rcu_dereference(cssid->css); -} -EXPORT_SYMBOL_GPL(css_lookup); -  /**   * css_from_dir - get corresponding css from the dentry of a cgroup dir   * @dentry: directory dentry of interest diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 859c8dfd78a1..e5f3917aa05b 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -120,7 +120,7 @@ void context_tracking_user_enter(void)   * instead of preempt_schedule() to exit user context if needed before   * calling the scheduler.   */ -void __sched notrace preempt_schedule_context(void) +asmlinkage void __sched notrace preempt_schedule_context(void)  {  	enum ctx_state prev_ctx; diff --git a/kernel/cpu.c b/kernel/cpu.c index d7f07a2da5a6..deff2e693766 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -306,8 +306,28 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)  				__func__, cpu);  		goto out_release;  	} + +	/* +	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled +	 * and RCU users of this state to go away such that all new such users +	 * will observe it. +	 * +	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might +	 * not imply sync_sched(), so explicitly call both. +	 * +	 * Do sync before park smpboot threads to take care the rcu boost case. +	 */ +#ifdef CONFIG_PREEMPT +	synchronize_sched(); +#endif +	synchronize_rcu(); +  	smpboot_park_threads(cpu); +	/* +	 * So now all preempt/rcu users must observe !cpu_active(). +	 */ +  	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));  	if (err) {  		/* CPU didn't die: tell everyone.  Can't complain. 
*/ @@ -420,11 +440,6 @@ int cpu_up(unsigned int cpu)  {  	int err = 0; -#ifdef	CONFIG_MEMORY_HOTPLUG -	int nid; -	pg_data_t	*pgdat; -#endif -  	if (!cpu_possible(cpu)) {  		printk(KERN_ERR "can't online cpu %d because it is not "  			"configured as may-hotadd at boot time\n", cpu); @@ -435,27 +450,9 @@ int cpu_up(unsigned int cpu)  		return -EINVAL;  	} -#ifdef	CONFIG_MEMORY_HOTPLUG -	nid = cpu_to_node(cpu); -	if (!node_online(nid)) { -		err = mem_online_node(nid); -		if (err) -			return err; -	} - -	pgdat = NODE_DATA(nid); -	if (!pgdat) { -		printk(KERN_ERR -			"Can't online cpu %d due to NULL pgdat\n", cpu); -		return -ENOMEM; -	} - -	if (pgdat->node_zonelists->_zonerefs->zone == NULL) { -		mutex_lock(&zonelists_mutex); -		build_all_zonelists(NULL, NULL); -		mutex_unlock(&zonelists_mutex); -	} -#endif +	err = try_online_node(cpu_to_node(cpu)); +	if (err) +		return err;  	cpu_maps_update_begin(); diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index e695c0a0bcb5..988573a9a387 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)  	rcu_idle_enter();  	trace_cpu_idle_rcuidle(0, smp_processor_id());  	local_irq_enable(); -	while (!need_resched()) +	while (!tif_need_resched())  		cpu_relax();  	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());  	rcu_idle_exit(); @@ -92,8 +92,7 @@ static void cpu_idle_loop(void)  			if (cpu_idle_force_poll || tick_check_broadcast_expired()) {  				cpu_idle_poll();  			} else { -				current_clr_polling(); -				if (!need_resched()) { +				if (!current_clr_polling_and_test()) {  					stop_critical_timings();  					rcu_idle_enter();  					arch_cpu_idle(); @@ -103,9 +102,16 @@ static void cpu_idle_loop(void)  				} else {  					local_irq_enable();  				} -				current_set_polling(); +				__current_set_polling();  			}  			arch_cpu_idle_exit(); +			/* +			 * We need to test and propagate the TIF_NEED_RESCHED +			 * bit here because we might not have send the +			 * reschedule IPI to idle tasks. 
+			 */ +			if (tif_need_resched()) +				set_preempt_need_resched();  		}  		tick_nohz_idle_exit();  		schedule_preempt_disabled(); @@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)  	 */  	boot_init_stack_canary();  #endif -	current_set_polling(); +	__current_set_polling();  	arch_cpu_idle_prepare();  	cpu_idle_loop();  } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6bf981e13c43..4772034b4b17 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,  	need_loop = task_has_mempolicy(tsk) ||  			!nodes_intersects(*newmems, tsk->mems_allowed); -	if (need_loop) +	if (need_loop) { +		local_irq_disable();  		write_seqcount_begin(&tsk->mems_allowed_seq); +	}  	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);  	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); @@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,  	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);  	tsk->mems_allowed = *newmems; -	if (need_loop) +	if (need_loop) {  		write_seqcount_end(&tsk->mems_allowed_seq); +		local_irq_enable(); +	}  	task_unlock(tsk);  } diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0506d447aed2..7d2f35e5df2f 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -575,8 +575,12 @@ return_normal:  		raw_spin_lock(&dbg_slave_lock);  #ifdef CONFIG_SMP +	/* If send_ready set, slaves are already waiting */ +	if (ks->send_ready) +		atomic_set(ks->send_ready, 1); +  	/* Signal the other CPUs to enter kgdb_wait() */ -	if ((!kgdb_single_step) && kgdb_do_roundup) +	else if ((!kgdb_single_step) && kgdb_do_roundup)  		kgdb_roundup_cpus(flags);  #endif @@ -678,11 +682,11 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)  	if (arch_kgdb_ops.enable_nmi)  		arch_kgdb_ops.enable_nmi(0); +	memset(ks, 0, sizeof(struct kgdb_state));  	ks->cpu			= raw_smp_processor_id();  	ks->ex_vector		= evector;  	ks->signo		= signo;  	ks->err_code		= ecode; -	ks->kgdb_usethreadid	= 0;  	ks->linux_regs		= regs;  	if (kgdb_reenter_check(ks)) @@ -732,6 +736,30 @@ int kgdb_nmicallback(int cpu, void *regs)  	return 1;  } +int kgdb_nmicallin(int cpu, int trapnr, void *regs, atomic_t *send_ready) +{ +#ifdef CONFIG_SMP +	if (!kgdb_io_ready(0) || !send_ready) +		return 1; + +	if (kgdb_info[cpu].enter_kgdb == 0) { +		struct kgdb_state kgdb_var; +		struct kgdb_state *ks = &kgdb_var; + +		memset(ks, 0, sizeof(struct kgdb_state)); +		ks->cpu			= cpu; +		ks->ex_vector		= trapnr; +		ks->signo		= SIGTRAP; +		ks->err_code		= KGDB_KDB_REASON_SYSTEM_NMI; +		ks->linux_regs		= regs; +		ks->send_ready		= send_ready; +		kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); +		return 0; +	} +#endif +	return 1; +} +  static void kgdb_console_write(struct console *co, const char *s,     unsigned count)  { diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 2235967e78b0..572aa4f5677c 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -26,6 +26,7 @@ struct kgdb_state {  	unsigned long		threadid;  	long			kgdb_usethreadid;  	struct pt_regs		*linux_regs; +	atomic_t		*send_ready;  };  /* Exception state values */ @@ -74,11 +75,13 @@ extern int kdb_stub(struct kgdb_state *ks);  extern int kdb_parse(const char *cmdstr);  extern int kdb_common_init_state(struct kgdb_state *ks);  extern int kdb_common_deinit_state(void); +#define KGDB_KDB_REASON_SYSTEM_NMI KDB_REASON_SYSTEM_NMI  #else /* ! 
CONFIG_KGDB_KDB */  static inline int kdb_stub(struct kgdb_state *ks)  {  	return DBG_PASS_EVENT;  } +#define KGDB_KDB_REASON_SYSTEM_NMI 0  #endif /* CONFIG_KGDB_KDB */  #endif /* _DEBUG_CORE_H_ */ diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 328d18ef31e4..8859ca34dcfe 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -69,7 +69,10 @@ int kdb_stub(struct kgdb_state *ks)  	if (atomic_read(&kgdb_setting_breakpoint))  		reason = KDB_REASON_KEYBOARD; -	if (in_nmi()) +	if (ks->err_code == KDB_REASON_SYSTEM_NMI && ks->signo == SIGTRAP) +		reason = KDB_REASON_SYSTEM_NMI; + +	else if (in_nmi())  		reason = KDB_REASON_NMI;  	for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 00eb8f7fbf41..0b097c8a1e50 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1200,6 +1200,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,  			   instruction_pointer(regs));  		kdb_dumpregs(regs);  		break; +	case KDB_REASON_SYSTEM_NMI: +		kdb_printf("due to System NonMaskable Interrupt\n"); +		break;  	case KDB_REASON_NMI:  		kdb_printf("due to NonMaskable Interrupt @ "  			   kdb_machreg_fmt "\n", diff --git a/kernel/delayacct.c b/kernel/delayacct.c index d473988c1d0b..54996b71e66d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -108,12 +108,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)  	struct timespec ts;  	cputime_t utime, stime, stimescaled, utimescaled; -	/* Though tsk->delays accessed later, early exit avoids -	 * unnecessary returning of other data -	 */ -	if (!tsk->delays) -		goto done; -  	tmp = (s64)d->cpu_run_real_total;  	task_cputime(tsk, &utime, &stime);  	cputime_to_timespec(utime + stime, &ts); @@ -158,7 +152,6 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)  	d->freepages_count += tsk->delays->freepages_count;  	spin_unlock_irqrestore(&tsk->delays->lock, flags); -done:  	return 0;  } diff --git a/kernel/elfcore.c b/kernel/elfcore.c index ff915efef66d..e556751d15d9 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -1,23 +1,19 @@  #include <linux/elf.h>  #include <linux/fs.h>  #include <linux/mm.h> - -#include <asm/elf.h> - +#include <linux/binfmts.h>  Elf_Half __weak elf_core_extra_phdrs(void)  {  	return 0;  } -int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, -				      unsigned long limit) +int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)  {  	return 1;  } -int __weak elf_core_write_extra_data(struct file *file, size_t *size, -				     unsigned long limit) +int __weak elf_core_write_extra_data(struct coredump_params *cprm)  {  	return 1;  } diff --git a/kernel/events/core.c b/kernel/events/core.c index d49a9d29334c..d724e7757cd1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;  static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);  static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS; -static atomic_t perf_sample_allowed_ns __read_mostly = -	ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); +static int perf_sample_allowed_ns __read_mostly = +	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;  void update_perf_cpu_limits(void)  { @@ -184,7 +184,7 
@@ void update_perf_cpu_limits(void)  	tmp *= sysctl_perf_cpu_time_max_percent;  	do_div(tmp, 100); -	atomic_set(&perf_sample_allowed_ns, tmp); +	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;  }  static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,  		void __user *buffer, size_t *lenp,  		loff_t *ppos)  { -	int ret = proc_dointvec(table, write, buffer, lenp, ppos); +	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);  	if (ret || !write)  		return ret; @@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,   * we detect that events are taking too long.   */  #define NR_ACCUMULATED_SAMPLES 128 -DEFINE_PER_CPU(u64, running_sample_length); +static DEFINE_PER_CPU(u64, running_sample_length);  void perf_sample_event_took(u64 sample_len_ns)  {  	u64 avg_local_sample_len;  	u64 local_samples_len; +	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); -	if (atomic_read(&perf_sample_allowed_ns) == 0) +	if (allowed_ns == 0)  		return;  	/* decay the counter by 1 average sample */ @@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)  	 */  	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; -	if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) +	if (avg_local_sample_len <= allowed_ns)  		return;  	if (max_samples_per_tick <= 1) @@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)  	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;  	printk_ratelimited(KERN_WARNING -			"perf samples too long (%lld > %d), lowering " +			"perf samples too long (%lld > %lld), lowering "  			"kernel.perf_event_max_sample_rate to %d\n", -			avg_local_sample_len, -			atomic_read(&perf_sample_allowed_ns), +			avg_local_sample_len, allowed_ns,  			sysctl_perf_event_sample_rate);  	update_perf_cpu_limits(); @@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)  		put_ctx(ctx->parent_ctx);  		ctx->parent_ctx = NULL;  	} +	ctx->generation++;  }  static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) @@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)  	ctx->nr_events++;  	if (event->attr.inherit_stat)  		ctx->nr_stat++; + +	ctx->generation++;  }  /* @@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)  	if (sample_type & PERF_SAMPLE_DATA_SRC)  		size += sizeof(data->data_src.val); +	if (sample_type & PERF_SAMPLE_TRANSACTION) +		size += sizeof(data->txn); +  	event->header_size = size;  } @@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)  	 */  	if (event->state > PERF_EVENT_STATE_OFF)  		event->state = PERF_EVENT_STATE_OFF; + +	ctx->generation++;  }  static void perf_group_detach(struct perf_event *event) @@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,  }  /* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled events. 
- * If the number of enabled events is the same, then the set - * of enabled events should be the same, because these are both - * inherited contexts, therefore we can't access individual events - * in them directly with an fd; we can only enable/disable all - * events via prctl, or enable/disable all events in a family - * via ioctl, which will have the same effect on both contexts. + * Test whether two contexts are equivalent, i.e. whether they have both been + * cloned from the same version of the same context. + * + * Equivalence is measured using a generation number in the context that is + * incremented on each modification to it; see unclone_ctx(), list_add_event() + * and list_del_event().   */  static int context_equiv(struct perf_event_context *ctx1,  			 struct perf_event_context *ctx2)  { -	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx -		&& ctx1->parent_gen == ctx2->parent_gen -		&& !ctx1->pin_count && !ctx2->pin_count; +	/* Pinning disables the swap optimization */ +	if (ctx1->pin_count || ctx2->pin_count) +		return 0; + +	/* If ctx1 is the parent of ctx2 */ +	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) +		return 1; + +	/* If ctx2 is the parent of ctx1 */ +	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) +		return 1; + +	/* +	 * If ctx1 and ctx2 have the same parent; we flatten the parent +	 * hierarchy, see perf_event_init_context(). +	 */ +	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && +			ctx1->parent_gen == ctx2->parent_gen) +		return 1; + +	/* Unmatched */ +	return 0;  }  static void __perf_event_sync_stat(struct perf_event *event, @@ -2210,9 +2234,6 @@ static void __perf_event_sync_stat(struct perf_event *event,  	perf_event_update_userpage(next_event);  } -#define list_next_entry(pos, member) \ -	list_entry(pos->member.next, typeof(*pos), member) -  static void perf_event_sync_stat(struct perf_event_context *ctx,  				   struct perf_event_context *next_ctx)  { @@ -2244,7 +2265,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,  {  	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];  	struct perf_event_context *next_ctx; -	struct perf_event_context *parent; +	struct perf_event_context *parent, *next_parent;  	struct perf_cpu_context *cpuctx;  	int do_switch = 1; @@ -2256,10 +2277,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,  		return;  	rcu_read_lock(); -	parent = rcu_dereference(ctx->parent_ctx);  	next_ctx = next->perf_event_ctxp[ctxn]; -	if (parent && next_ctx && -	    rcu_dereference(next_ctx->parent_ctx) == parent) { +	if (!next_ctx) +		goto unlock; + +	parent = rcu_dereference(ctx->parent_ctx); +	next_parent = rcu_dereference(next_ctx->parent_ctx); + +	/* If neither context have a parent context; they cannot be clones. */ +	if (!parent && !next_parent) +		goto unlock; + +	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {  		/*  		 * Looks like the two contexts are clones, so we might be  		 * able to optimize the context switch.  
We lock both @@ -2287,6 +2316,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,  		raw_spin_unlock(&next_ctx->lock);  		raw_spin_unlock(&ctx->lock);  	} +unlock:  	rcu_read_unlock();  	if (do_switch) { @@ -4572,6 +4602,9 @@ void perf_output_sample(struct perf_output_handle *handle,  	if (sample_type & PERF_SAMPLE_DATA_SRC)  		perf_output_put(handle, data->data_src.val); +	if (sample_type & PERF_SAMPLE_TRANSACTION) +		perf_output_put(handle, data->txn); +  	if (!event->attr.watermark) {  		int wakeup_events = event->attr.wakeup_events; @@ -5100,27 +5133,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)  	unsigned int size;  	char tmp[16];  	char *buf = NULL; -	const char *name; - -	memset(tmp, 0, sizeof(tmp)); +	char *name;  	if (file) {  		struct inode *inode;  		dev_t dev; + +		buf = kmalloc(PATH_MAX, GFP_KERNEL); +		if (!buf) { +			name = "//enomem"; +			goto cpy_name; +		}  		/* -		 * d_path works from the end of the rb backwards, so we +		 * d_path() works from the end of the rb backwards, so we  		 * need to add enough zero bytes after the string to handle  		 * the 64bit alignment we do later.  		 */ -		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); -		if (!buf) { -			name = strncpy(tmp, "//enomem", sizeof(tmp)); -			goto got_name; -		} -		name = d_path(&file->f_path, buf, PATH_MAX); +		name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));  		if (IS_ERR(name)) { -			name = strncpy(tmp, "//toolong", sizeof(tmp)); -			goto got_name; +			name = "//toolong"; +			goto cpy_name;  		}  		inode = file_inode(vma->vm_file);  		dev = inode->i_sb->s_dev; @@ -5128,34 +5160,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)  		gen = inode->i_generation;  		maj = MAJOR(dev);  		min = MINOR(dev); - +		goto got_name;  	} else { -		if (arch_vma_name(mmap_event->vma)) { -			name = strncpy(tmp, arch_vma_name(mmap_event->vma), -				       sizeof(tmp) - 1); -			tmp[sizeof(tmp) - 1] = '\0'; -			goto got_name; -		} +		name = (char *)arch_vma_name(vma); +		if (name) +			goto cpy_name; -		if (!vma->vm_mm) { -			name = strncpy(tmp, "[vdso]", sizeof(tmp)); -			goto got_name; -		} else if (vma->vm_start <= vma->vm_mm->start_brk && +		if (vma->vm_start <= vma->vm_mm->start_brk &&  				vma->vm_end >= vma->vm_mm->brk) { -			name = strncpy(tmp, "[heap]", sizeof(tmp)); -			goto got_name; -		} else if (vma->vm_start <= vma->vm_mm->start_stack && +			name = "[heap]"; +			goto cpy_name; +		} +		if (vma->vm_start <= vma->vm_mm->start_stack &&  				vma->vm_end >= vma->vm_mm->start_stack) { -			name = strncpy(tmp, "[stack]", sizeof(tmp)); -			goto got_name; +			name = "[stack]"; +			goto cpy_name;  		} -		name = strncpy(tmp, "//anon", sizeof(tmp)); -		goto got_name; +		name = "//anon"; +		goto cpy_name;  	} +cpy_name: +	strlcpy(tmp, name, sizeof(tmp)); +	name = tmp;  got_name: -	size = ALIGN(strlen(name)+1, sizeof(u64)); +	/* +	 * Since our buffer works in 8 byte units we need to align our string +	 * size to a multiple of 8. However, we must guarantee the tail end is +	 * zero'd out to avoid leaking random bits to userspace. 
+	 */ +	size = strlen(name)+1; +	while (!IS_ALIGNED(size, sizeof(u64))) +		name[size++] = '\0';  	mmap_event->file_name = name;  	mmap_event->file_size = size; @@ -6292,6 +6329,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)  	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);  } +static DEVICE_ATTR_RO(type);  static ssize_t  perf_event_mux_interval_ms_show(struct device *dev, @@ -6336,17 +6374,19 @@ perf_event_mux_interval_ms_store(struct device *dev,  	return count;  } +static DEVICE_ATTR_RW(perf_event_mux_interval_ms); -static struct device_attribute pmu_dev_attrs[] = { -	__ATTR_RO(type), -	__ATTR_RW(perf_event_mux_interval_ms), -	__ATTR_NULL, +static struct attribute *pmu_dev_attrs[] = { +	&dev_attr_type.attr, +	&dev_attr_perf_event_mux_interval_ms.attr, +	NULL,  }; +ATTRIBUTE_GROUPS(pmu_dev);  static int pmu_bus_running;  static struct bus_type pmu_bus = {  	.name		= "event_source", -	.dev_attrs	= pmu_dev_attrs, +	.dev_groups	= pmu_dev_groups,  };  static void pmu_dev_release(struct device *dev) @@ -6767,6 +6807,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,  	if (ret)  		return -EFAULT; +	/* disabled for now */ +	if (attr->mmap2) +		return -EINVAL; +  	if (attr->__reserved_1)  		return -EINVAL; @@ -7122,7 +7166,6 @@ SYSCALL_DEFINE5(perf_event_open,  	}  	perf_install_in_context(ctx, event, event->cpu); -	++ctx->generation;  	perf_unpin_context(ctx);  	mutex_unlock(&ctx->mutex); @@ -7205,7 +7248,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,  	WARN_ON_ONCE(ctx->parent_ctx);  	mutex_lock(&ctx->mutex);  	perf_install_in_context(ctx, event, cpu); -	++ctx->generation;  	perf_unpin_context(ctx);  	mutex_unlock(&ctx->mutex); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ca6599723be5..569b218782ad 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)  }  #define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\ -static inline unsigned int						\ +static inline unsigned long						\  func_name(struct perf_output_handle *handle,				\ -	  const void *buf, unsigned int len)				\ +	  const void *buf, unsigned long len)				\  {									\  	unsigned long size, written;					\  									\  	do {								\ -		size = min_t(unsigned long, handle->size, len);		\ -									\ +		size    = min(handle->size, len);			\  		written = memcpy_func(handle->addr, buf, size);		\ +		written = size - written;				\  									\  		len -= written;						\  		handle->addr += written;				\ @@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle,				\  	return len;							\  } -static inline int memcpy_common(void *dst, const void *src, size_t n) +static inline unsigned long +memcpy_common(void *dst, const void *src, unsigned long n)  {  	memcpy(dst, src, n); -	return n; +	return 0;  }  DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) -#define MEMCPY_SKIP(dst, src, n) (n) +static inline unsigned long +memcpy_skip(void *dst, const void *src, unsigned long n) +{ +	return 0; +} -DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) +DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)  #ifndef arch_perf_out_copy_user -#define arch_perf_out_copy_user __copy_from_user_inatomic +#define arch_perf_out_copy_user arch_perf_out_copy_user + +static inline unsigned long +arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) +{ +	unsigned long ret; + +	pagefault_disable(); +	ret = __copy_from_user_inatomic(dst, src, n); +	
pagefault_enable(); + +	return ret; +}  #endif  DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index cd55144270b5..e8b168af135b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -12,40 +12,10 @@  #include <linux/perf_event.h>  #include <linux/vmalloc.h>  #include <linux/slab.h> +#include <linux/circ_buf.h>  #include "internal.h" -static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, -			      unsigned long offset, unsigned long head) -{ -	unsigned long sz = perf_data_size(rb); -	unsigned long mask = sz - 1; - -	/* -	 * check if user-writable -	 * overwrite : over-write its own tail -	 * !overwrite: buffer possibly drops events. -	 */ -	if (rb->overwrite) -		return true; - -	/* -	 * verify that payload is not bigger than buffer -	 * otherwise masking logic may fail to detect -	 * the "not enough space" condition -	 */ -	if ((head - offset) > sz) -		return false; - -	offset = (offset - tail) & mask; -	head   = (head   - tail) & mask; - -	if ((int)(head - offset) < 0) -		return false; - -	return true; -} -  static void perf_output_wakeup(struct perf_output_handle *handle)  {  	atomic_set(&handle->rb->poll, POLL_IN); @@ -87,15 +57,36 @@ again:  		goto out;  	/* -	 * Publish the known good head. Rely on the full barrier implied -	 * by atomic_dec_and_test() order the rb->head read and this -	 * write. +	 * Since the mmap() consumer (userspace) can run on a different CPU: +	 * +	 *   kernel				user +	 * +	 *   READ ->data_tail			READ ->data_head +	 *   smp_mb()	(A)			smp_rmb()	(C) +	 *   WRITE $data			READ $data +	 *   smp_wmb()	(B)			smp_mb()	(D) +	 *   STORE ->data_head			WRITE ->data_tail +	 * +	 * Where A pairs with D, and B pairs with C. +	 * +	 * I don't think A needs to be a full barrier because we won't in fact +	 * write data until we see the store from userspace. So we simply don't +	 * issue the data WRITE until we observe it. Be conservative for now. +	 * +	 * OTOH, D needs to be a full barrier since it separates the data READ +	 * from the tail WRITE. +	 * +	 * For B a WMB is sufficient since it separates two WRITEs, and for C +	 * an RMB is sufficient since it separates two READs. +	 * +	 * See perf_output_begin().  	 */ +	smp_wmb();  	rb->user_page->data_head = head;  	/* -	 * Now check if we missed an update, rely on the (compiler) -	 * barrier in atomic_dec_and_test() to re-read rb->head. +	 * Now check if we missed an update -- rely on previous implied +	 * compiler barriers to force a re-read.  	 
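
[Editor's note] The kernel/events/internal.h hunk above changes the output-copy helpers to return the number of bytes *not* copied (0 on full success), and DEFINE_OUTPUT_COPY turns that into the amount to advance by. A minimal userspace sketch of that convention, not part of the patch; copy_chunk and output_copy are illustrative names:

#include <stdio.h>
#include <string.h>

/* Mimics the new convention: return bytes NOT copied, 0 on full success. */
static unsigned long copy_chunk(void *dst, const void *src, unsigned long n)
{
	memcpy(dst, src, n);
	return 0;
}

/*
 * Mirrors the DEFINE_OUTPUT_COPY loop: ask for up to 'avail' bytes per step,
 * convert the "not copied" return into the number of bytes actually written,
 * and advance by that amount. Returns the bytes left uncopied.
 */
static unsigned long output_copy(char *dst, unsigned long avail,
				 const char *buf, unsigned long len)
{
	while (len && avail) {
		unsigned long size = len < avail ? len : avail;
		unsigned long not_copied = copy_chunk(dst, buf, size);
		unsigned long written = size - not_copied;

		if (!written)
			break;		/* nothing advanced; avoid spinning */
		len   -= written;
		buf   += written;
		dst   += written;
		avail -= written;
	}
	return len;
}

int main(void)
{
	char out[16] = { 0 };
	unsigned long left = output_copy(out, sizeof(out) - 1, "hello", 5);

	printf("left=%lu out=%s\n", left, out);
	return 0;
}
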
*/  	if (unlikely(head != local_read(&rb->head))) {  		local_inc(&rb->nest); @@ -114,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle,  {  	struct ring_buffer *rb;  	unsigned long tail, offset, head; -	int have_lost; -	struct perf_sample_data sample_data; +	int have_lost, page_shift;  	struct {  		struct perf_event_header header;  		u64			 id; @@ -130,55 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle,  		event = event->parent;  	rb = rcu_dereference(event->rb); -	if (!rb) +	if (unlikely(!rb))  		goto out; -	handle->rb	= rb; -	handle->event	= event; - -	if (!rb->nr_pages) +	if (unlikely(!rb->nr_pages))  		goto out; +	handle->rb    = rb; +	handle->event = event; +  	have_lost = local_read(&rb->lost); -	if (have_lost) { -		lost_event.header.size = sizeof(lost_event); -		perf_event_header__init_id(&lost_event.header, &sample_data, -					   event); -		size += lost_event.header.size; +	if (unlikely(have_lost)) { +		size += sizeof(lost_event); +		if (event->attr.sample_id_all) +			size += event->id_header_size;  	}  	perf_output_get_handle(handle);  	do { -		/* -		 * Userspace could choose to issue a mb() before updating the -		 * tail pointer. So that all reads will be completed before the -		 * write is issued. -		 */  		tail = ACCESS_ONCE(rb->user_page->data_tail); -		smp_rmb();  		offset = head = local_read(&rb->head); -		head += size; -		if (unlikely(!perf_output_space(rb, tail, offset, head))) +		if (!rb->overwrite && +		    unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))  			goto fail; +		head += size;  	} while (local_cmpxchg(&rb->head, offset, head) != offset); -	if (head - local_read(&rb->wakeup) > rb->watermark) +	/* +	 * Separate the userpage->tail read from the data stores below. +	 * Matches the MB userspace SHOULD issue after reading the data +	 * and before storing the new tail position. +	 * +	 * See perf_output_put_handle(). 
+	 */ +	smp_mb(); + +	if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))  		local_add(rb->watermark, &rb->wakeup); -	handle->page = offset >> (PAGE_SHIFT + page_order(rb)); -	handle->page &= rb->nr_pages - 1; -	handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); -	handle->addr = rb->data_pages[handle->page]; -	handle->addr += handle->size; -	handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; +	page_shift = PAGE_SHIFT + page_order(rb); -	if (have_lost) { +	handle->page = (offset >> page_shift) & (rb->nr_pages - 1); +	offset &= (1UL << page_shift) - 1; +	handle->addr = rb->data_pages[handle->page] + offset; +	handle->size = (1UL << page_shift) - offset; + +	if (unlikely(have_lost)) { +		struct perf_sample_data sample_data; + +		lost_event.header.size = sizeof(lost_event);  		lost_event.header.type = PERF_RECORD_LOST;  		lost_event.header.misc = 0;  		lost_event.id          = event->id;  		lost_event.lost        = local_xchg(&rb->lost, 0); +		perf_event_header__init_id(&lost_event.header, +					   &sample_data, event);  		perf_output_put(handle, lost_event);  		perf_event__output_id_sample(event, handle, &sample_data);  	} diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ad8e1bdca70e..24b7d6ca871b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -35,6 +35,7 @@  #include <linux/kdebug.h>	/* notifier mechanism */  #include "../../mm/internal.h"	/* munlock_vma_page */  #include <linux/percpu-rwsem.h> +#include <linux/task_work.h>  #include <linux/uprobes.h> @@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t   * the architecture. If an arch has variable length instruction and the   * breakpoint instruction is not of the smallest length instruction   * supported by that architecture then we need to modify is_trap_at_addr and - * write_opcode accordingly. This would never be a problem for archs that - * have fixed length instructions. + * uprobe_write_opcode accordingly. This would never be a problem for archs + * that have fixed length instructions.   */  /* - * write_opcode - write the opcode at a given virtual address. + * uprobe_write_opcode - write the opcode at a given virtual address.   * @mm: the probed process address space.   * @vaddr: the virtual address to store the opcode.   * @opcode: opcode to be written at @vaddr. @@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t   * For mm @mm, write the opcode at @vaddr.   * Return 0 (success) or a negative errno.   
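
[Editor's note] The kernel/events/ring_buffer.c hunks above spell out the producer/consumer ordering contract for the mmap()ed ring buffer (A pairs with D, B pairs with C) and switch the space check to CIRC_SPACE(). Below is a minimal userspace model of both sides, not part of the patch: struct ringbuf and its fields are illustrative stand-ins for the real perf control page, and C11 acquire/release atomics stand in for the smp_*mb() calls the comment describes.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for the mmap'ed perf control page plus data area. */
struct ringbuf {
	_Atomic unsigned long data_head;	/* advanced by the producer */
	_Atomic unsigned long data_tail;	/* advanced by the consumer */
	unsigned long size;			/* power of two */
	unsigned char data[];
};

/* Same arithmetic as CIRC_SPACE(head, tail, size) for a power-of-two buffer. */
static unsigned long ring_space(unsigned long head, unsigned long tail,
				unsigned long size)
{
	return (tail - (head + 1)) & (size - 1);
}

/*
 * Producer: the acquire load of data_tail keeps the data stores after the
 * tail read (barrier A above), and the release store of data_head keeps them
 * before the head publish (barrier B).
 */
static int ring_write(struct ringbuf *rb, const void *src, unsigned long len)
{
	unsigned long head = atomic_load_explicit(&rb->data_head,
						  memory_order_relaxed);
	unsigned long tail = atomic_load_explicit(&rb->data_tail,
						  memory_order_acquire);
	unsigned long i;

	if (ring_space(head, tail, rb->size) < len)
		return -1;	/* no room: drop, as the !overwrite case does */

	for (i = 0; i < len; i++)
		rb->data[(head + i) & (rb->size - 1)] =
			((const unsigned char *)src)[i];

	atomic_store_explicit(&rb->data_head, head + len, memory_order_release);
	return 0;
}

/*
 * Consumer: the acquire load of data_head pairs with the producer's release
 * of the head (C), and the release store of data_tail pairs with the
 * producer's acquire of the tail (D).
 */
static unsigned long ring_read(struct ringbuf *rb, void *dst, unsigned long max)
{
	unsigned long head = atomic_load_explicit(&rb->data_head,
						  memory_order_acquire);
	unsigned long tail = atomic_load_explicit(&rb->data_tail,
						  memory_order_relaxed);
	unsigned long avail = head - tail;
	unsigned long n = avail < max ? avail : max;
	unsigned long i;

	for (i = 0; i < n; i++)
		((unsigned char *)dst)[i] =
			rb->data[(tail + i) & (rb->size - 1)];

	atomic_store_explicit(&rb->data_tail, tail + n, memory_order_release);
	return n;
}

int main(void)
{
	struct ringbuf *rb = calloc(1, sizeof(*rb) + 16);
	char out[8] = { 0 };

	if (!rb)
		return 1;
	rb->size = 16;
	ring_write(rb, "hi", 2);
	ring_read(rb, out, sizeof(out) - 1);
	printf("%s\n", out);
	free(rb);
	return 0;
}
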
*/ -static int write_opcode(struct mm_struct *mm, unsigned long vaddr, +int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,  			uprobe_opcode_t opcode)  {  	struct page *old_page, *new_page; @@ -314,7 +315,7 @@ put_old:   */  int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)  { -	return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); +	return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);  }  /** @@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned  int __weak  set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)  { -	return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); +	return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);  }  static int match_uprobe(struct uprobe *l, struct uprobe *r) @@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)  	return ret;  } -static int -__copy_insn(struct address_space *mapping, struct file *filp, char *insn, -			unsigned long nbytes, loff_t offset) +static int __copy_insn(struct address_space *mapping, struct file *filp, +			void *insn, int nbytes, loff_t offset)  {  	struct page *page; @@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,  static int copy_insn(struct uprobe *uprobe, struct file *filp)  { -	struct address_space *mapping; -	unsigned long nbytes; -	int bytes; - -	nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); -	mapping = uprobe->inode->i_mapping; +	struct address_space *mapping = uprobe->inode->i_mapping; +	loff_t offs = uprobe->offset; +	void *insn = uprobe->arch.insn; +	int size = MAX_UINSN_BYTES; +	int len, err = -EIO; -	/* Instruction at end of binary; copy only available bytes */ -	if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) -		bytes = uprobe->inode->i_size - uprobe->offset; -	else -		bytes = MAX_UINSN_BYTES; +	/* Copy only available bytes, -EIO if nothing was read */ +	do { +		if (offs >= i_size_read(uprobe->inode)) +			break; -	/* Instruction at the page-boundary; copy bytes in second page */ -	if (nbytes < bytes) { -		int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes, -				bytes - nbytes, uprobe->offset + nbytes); +		len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK)); +		err = __copy_insn(mapping, filp, insn, len, offs);  		if (err) -			return err; -		bytes = nbytes; -	} -	return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); +			break; + +		insn += len; +		offs += len; +		size -= len; +	} while (size); + +	return err;  }  static int prepare_uprobe(struct uprobe *uprobe, struct file *file, @@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,  	if (ret)  		goto out; -	/* write_opcode() assumes we don't cross page boundary */ +	/* uprobe_write_opcode() assumes we don't cross page boundary */  	BUG_ON((uprobe->offset & ~PAGE_MASK) +  			UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); @@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon  }  /* Slot allocation for XOL */ -static int xol_add_vma(struct xol_area *area) +static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)  { -	struct mm_struct *mm = current->mm;  	int ret = -EALREADY;  	down_write(&mm->mmap_sem);  	if (mm->uprobes_state.xol_area)  		goto fail; -	ret = -ENOMEM; -	/* Try to map as high as possible, this is only a hint. 
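
[Editor's note] copy_insn() above becomes a loop that copies at most up to the next page boundary per iteration and returns -EIO only if nothing could be read. A small userspace sketch of the same chunking, not part of the patch, with an assumed page size and a fake read callback in place of the kernel helpers:

#include <stdio.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096UL

/* Illustrative source: pretend the "file" is a flat buffer of f_size bytes. */
static int read_chunk(const char *file, long f_size, void *dst, int nbytes,
		      long offs)
{
	long avail = f_size - offs;

	if (avail <= 0)
		return -5;		/* nothing to read: -EIO-style */
	if (nbytes > avail)
		nbytes = avail;		/* short read at end of "file" */
	memcpy(dst, file + offs, nbytes);
	return 0;
}

/*
 * Mirrors the new copy_insn() loop: never cross a page boundary in one step,
 * stop early at end of file, and fail only if nothing at all was copied.
 */
static int copy_bounded(const char *file, long f_size, void *insn, int size,
			long offs)
{
	int err = -5;

	do {
		long in_page = SKETCH_PAGE_SIZE -
			       (offs & (SKETCH_PAGE_SIZE - 1));
		int len = size < in_page ? size : (int)in_page;

		if (offs >= f_size)
			break;
		err = read_chunk(file, f_size, insn, len, offs);
		if (err)
			break;

		insn = (char *)insn + len;
		offs += len;
		size -= len;
	} while (size);

	return err;
}

int main(void)
{
	char file[6000], insn[16];

	memset(file, 'x', sizeof(file));
	/* Starts 6 bytes before a page boundary, so two chunks are needed. */
	printf("err=%d\n", copy_bounded(file, sizeof(file), insn,
					sizeof(insn), 4090));
	return 0;
}
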
*/ -	area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); -	if (area->vaddr & ~PAGE_MASK) { -		ret = area->vaddr; -		goto fail; +	if (!area->vaddr) { +		/* Try to map as high as possible, this is only a hint. */ +		area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, +						PAGE_SIZE, 0, 0); +		if (area->vaddr & ~PAGE_MASK) { +			ret = area->vaddr; +			goto fail; +		}  	}  	ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, @@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area)  	smp_wmb();	/* pairs with get_xol_area() */  	mm->uprobes_state.xol_area = area; -	ret = 0;   fail:  	up_write(&mm->mmap_sem);  	return ret;  } -/* - * get_xol_area - Allocate process's xol_area if necessary. - * This area will be used for storing instructions for execution out of line. - * - * Returns the allocated area or NULL. - */ -static struct xol_area *get_xol_area(void) +static struct xol_area *__create_xol_area(unsigned long vaddr)  {  	struct mm_struct *mm = current->mm; -	struct xol_area *area;  	uprobe_opcode_t insn = UPROBE_SWBP_INSN; +	struct xol_area *area; -	area = mm->uprobes_state.xol_area; -	if (area) -		goto ret; - -	area = kzalloc(sizeof(*area), GFP_KERNEL); +	area = kmalloc(sizeof(*area), GFP_KERNEL);  	if (unlikely(!area))  		goto out; @@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void)  	if (!area->page)  		goto free_bitmap; -	/* allocate first slot of task's xol_area for the return probes */ +	area->vaddr = vaddr; +	init_waitqueue_head(&area->wq); +	/* Reserve the 1st slot for get_trampoline_vaddr() */  	set_bit(0, area->bitmap); -	copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);  	atomic_set(&area->slot_count, 1); -	init_waitqueue_head(&area->wq); +	copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); -	if (!xol_add_vma(area)) +	if (!xol_add_vma(mm, area))  		return area;  	__free_page(area->page); @@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void)   free_area:  	kfree(area);   out: +	return NULL; +} + +/* + * get_xol_area - Allocate process's xol_area if necessary. + * This area will be used for storing instructions for execution out of line. + * + * Returns the allocated area or NULL. + */ +static struct xol_area *get_xol_area(void) +{ +	struct mm_struct *mm = current->mm; +	struct xol_area *area; + +	if (!mm->uprobes_state.xol_area) +		__create_xol_area(0); +  	area = mm->uprobes_state.xol_area; - ret: -	smp_read_barrier_depends();     /* pairs with wmb in xol_add_vma() */ +	smp_read_barrier_depends();	/* pairs with wmb in xol_add_vma() */  	return area;  } @@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)  		return 0;  	/* Initialize the slot */ -	copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); +	copy_to_page(area->page, xol_vaddr, +			uprobe->arch.ixol, sizeof(uprobe->arch.ixol));  	/*  	 * We probably need flush_icache_user_range() but it needs vma.  	 * This should work on supported architectures too. @@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t)  }  /* - * Called in context of a new clone/fork from copy_process. - */ -void uprobe_copy_process(struct task_struct *t) -{ -	t->utask = NULL; -} - -/*   * Allocate a uprobe_task object for the task if if necessary.   * Called when the thread hits a breakpoint.   
* @@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void)  	return current->utask;  } +static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) +{ +	struct uprobe_task *n_utask; +	struct return_instance **p, *o, *n; + +	n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); +	if (!n_utask) +		return -ENOMEM; +	t->utask = n_utask; + +	p = &n_utask->return_instances; +	for (o = o_utask->return_instances; o; o = o->next) { +		n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); +		if (!n) +			return -ENOMEM; + +		*n = *o; +		atomic_inc(&n->uprobe->ref); +		n->next = NULL; + +		*p = n; +		p = &n->next; +		n_utask->depth++; +	} + +	return 0; +} + +static void uprobe_warn(struct task_struct *t, const char *msg) +{ +	pr_warn("uprobe: %s:%d failed to %s\n", +			current->comm, current->pid, msg); +} + +static void dup_xol_work(struct callback_head *work) +{ +	kfree(work); + +	if (current->flags & PF_EXITING) +		return; + +	if (!__create_xol_area(current->utask->vaddr)) +		uprobe_warn(current, "dup xol area"); +} + +/* + * Called in context of a new clone/fork from copy_process. + */ +void uprobe_copy_process(struct task_struct *t, unsigned long flags) +{ +	struct uprobe_task *utask = current->utask; +	struct mm_struct *mm = current->mm; +	struct callback_head *work; +	struct xol_area *area; + +	t->utask = NULL; + +	if (!utask || !utask->return_instances) +		return; + +	if (mm == t->mm && !(flags & CLONE_VFORK)) +		return; + +	if (dup_utask(t, utask)) +		return uprobe_warn(t, "dup ret instances"); + +	/* The task can fork() after dup_xol_work() fails */ +	area = mm->uprobes_state.xol_area; +	if (!area) +		return uprobe_warn(t, "dup xol area"); + +	if (mm == t->mm) +		return; + +	/* TODO: move it into the union in uprobe_task */ +	work = kmalloc(sizeof(*work), GFP_KERNEL); +	if (!work) +		return uprobe_warn(t, "dup xol area"); + +	t->utask->vaddr = area->vaddr; +	init_task_work(work, dup_xol_work); +	task_work_add(t, work, true); +} +  /*   * Current area->vaddr notion assume the trampoline address is always   * equal area->vaddr. @@ -1857,9 +1941,4 @@ static int __init init_uprobes(void)  	return register_die_notifier(&uprobe_exception_nb);  } -module_init(init_uprobes); - -static void __exit exit_uprobes(void) -{ -} -module_exit(exit_uprobes); +__initcall(init_uprobes); diff --git a/kernel/extable.c b/kernel/extable.c index 832cb28105bb..763faf037ec1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)  static inline int init_kernel_text(unsigned long addr)  {  	if (addr >= (unsigned long)_sinittext && -	    addr <= (unsigned long)_einittext) +	    addr < (unsigned long)_einittext)  		return 1;  	return 0;  } @@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr)  int core_kernel_text(unsigned long addr)  {  	if (addr >= (unsigned long)_stext && -	    addr <= (unsigned long)_etext) +	    addr < (unsigned long)_etext)  		return 1;  	if (system_state == SYSTEM_BOOTING && diff --git a/kernel/fork.c b/kernel/fork.c index 086fe73ad6bd..728d5be9548c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)  	mm->flags = (current->mm) ?  		
(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;  	mm->core_state = NULL; -	mm->nr_ptes = 0; +	atomic_long_set(&mm->nr_ptes, 0);  	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));  	spin_lock_init(&mm->page_table_lock);  	mm_init_aio(mm); @@ -560,7 +560,7 @@ static void check_mm(struct mm_struct *mm)  					  "mm:%p idx:%d val:%ld\n", mm, i, x);  	} -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS  	VM_BUG_ON(mm->pmd_huge_pte);  #endif  } @@ -814,12 +814,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)  	memcpy(mm, oldmm, sizeof(*mm));  	mm_init_cpumask(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS  	mm->pmd_huge_pte = NULL;  #endif -#ifdef CONFIG_NUMA_BALANCING -	mm->first_nid = NUMA_PTE_SCAN_INIT; -#endif  	if (!mm_init(mm, tsk))  		goto fail_nomem; @@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  #endif  	/* Perform scheduler related setup. Assign this task to a CPU. */ -	sched_fork(p); +	sched_fork(clone_flags, p);  	retval = perf_event_init_task(p);  	if (retval) @@ -1373,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	INIT_LIST_HEAD(&p->pi_state_list);  	p->pi_state_cache = NULL;  #endif -	uprobe_copy_process(p);  	/*  	 * sigaltstack should be cleared when sharing the same VM  	 */ @@ -1490,6 +1486,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	perf_event_fork(p);  	trace_task_newtask(p, clone_flags); +	uprobe_copy_process(p, clone_flags);  	return p; diff --git a/kernel/futex.c b/kernel/futex.c index c3a1a55a5214..80ba086f021d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -66,7 +66,7 @@  #include <asm/futex.h> -#include "rtmutex_common.h" +#include "locking/rtmutex_common.h"  int __read_mostly futex_cmpxchg_enabled; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index d4da55d1fb65..d04ce8ac4399 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -46,4 +46,34 @@ config GCOV_PROFILE_ALL  	larger and run slower. Also be sure to exclude files from profiling  	which are not linked to the kernel image to prevent linker errors. +choice +	prompt "Specify GCOV format" +	depends on GCOV_KERNEL +	default GCOV_FORMAT_AUTODETECT +	---help--- +	The gcov format is usually determined by the GCC version, but there are +	exceptions where format changes are integrated in lower-version GCCs. +	In such a case use this option to adjust the format used in the kernel +	accordingly. + +	If unsure, choose "Autodetect". + +config GCOV_FORMAT_AUTODETECT +	bool "Autodetect" +	---help--- +	Select this option to use the format that corresponds to your GCC +	version. + +config GCOV_FORMAT_3_4 +	bool "GCC 3.4 format" +	---help--- +	Select this option to use the format defined by GCC 3.4. + +config GCOV_FORMAT_4_7 +	bool "GCC 4.7 format" +	---help--- +	Select this option to use the format defined by GCC 4.7. 
+ +endchoice +  endmenu diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index e97ca59e2520..52aa7e8de927 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,3 +1,33 @@  ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' -obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o +# if-lt +# Usage VAR := $(call if-lt, $(a), $(b)) +# Returns 1 if (a < b) +if-lt = $(shell [ $(1) -lt $(2) ] && echo 1) + +ifeq ($(CONFIG_GCOV_FORMAT_3_4),y) +  cc-ver := 0304 +else ifeq ($(CONFIG_GCOV_FORMAT_4_7),y) +  cc-ver := 0407 +else +# Use cc-version if available, otherwise set 0 +# +# scripts/Kbuild.include, which contains cc-version function, is not included +# during make clean "make -f scripts/Makefile.clean obj=kernel/gcov" +# Meaning cc-ver is empty causing if-lt test to fail with +# "/bin/sh: line 0: [: -lt: unary operator expected" error mesage. +# This has no affect on the clean phase, but the error message could be +# confusing/annoying. So this dummy workaround sets cc-ver to zero if cc-version +# is not available. We can probably move if-lt to Kbuild.include, so it's also +# not defined during clean or to include Kbuild.include in +# scripts/Makefile.clean. But the following workaround seems least invasive. +  cc-ver := $(if $(call cc-version),$(call cc-version),0) +endif + +obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o + +ifeq ($(call if-lt, $(cc-ver), 0407),1) +  obj-$(CONFIG_GCOV_KERNEL) += gcc_3_4.o +else +  obj-$(CONFIG_GCOV_KERNEL) += gcc_4_7.o +endif diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9b22d03cc581..f45b75b713c0 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -20,7 +20,6 @@  #include <linux/mutex.h>  #include "gcov.h" -static struct gcov_info *gcov_info_head;  static int gcov_events_enabled;  static DEFINE_MUTEX(gcov_lock); @@ -34,7 +33,7 @@ void __gcov_init(struct gcov_info *info)  	mutex_lock(&gcov_lock);  	if (gcov_version == 0) { -		gcov_version = info->version; +		gcov_version = gcov_info_version(info);  		/*  		 * Printing gcc's version magic may prove useful for debugging  		 * incompatibility reports. @@ -45,8 +44,7 @@ void __gcov_init(struct gcov_info *info)  	 * Add new profiling data structure to list and inform event  	 * listener.  	 */ -	info->next = gcov_info_head; -	gcov_info_head = info; +	gcov_info_link(info);  	if (gcov_events_enabled)  		gcov_event(GCOV_ADD, info);  	mutex_unlock(&gcov_lock); @@ -81,6 +79,12 @@ void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)  }  EXPORT_SYMBOL(__gcov_merge_delta); +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ +	/* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_ior); +  /**   * gcov_enable_events - enable event reporting through gcov_event()   * @@ -91,13 +95,15 @@ EXPORT_SYMBOL(__gcov_merge_delta);   */  void gcov_enable_events(void)  { -	struct gcov_info *info; +	struct gcov_info *info = NULL;  	mutex_lock(&gcov_lock);  	gcov_events_enabled = 1; +  	/* Perform event callback for previously registered entries. 
*/ -	for (info = gcov_info_head; info; info = info->next) +	while ((info = gcov_info_next(info)))  		gcov_event(GCOV_ADD, info); +  	mutex_unlock(&gcov_lock);  } @@ -112,25 +118,23 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,  				void *data)  {  	struct module *mod = data; -	struct gcov_info *info; -	struct gcov_info *prev; +	struct gcov_info *info = NULL; +	struct gcov_info *prev = NULL;  	if (event != MODULE_STATE_GOING)  		return NOTIFY_OK;  	mutex_lock(&gcov_lock); -	prev = NULL; +  	/* Remove entries located in module from linked list. */ -	for (info = gcov_info_head; info; info = info->next) { +	while ((info = gcov_info_next(info))) {  		if (within(info, mod->module_core, mod->core_size)) { -			if (prev) -				prev->next = info->next; -			else -				gcov_info_head = info->next; +			gcov_info_unlink(prev, info);  			if (gcov_events_enabled)  				gcov_event(GCOV_REMOVE, info);  		} else  			prev = info;  	} +  	mutex_unlock(&gcov_lock);  	return NOTIFY_OK; diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 7a7d2ee96d42..15ff01a76379 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -75,7 +75,7 @@ static int __init gcov_persist_setup(char *str)  	unsigned long val;  	if (kstrtoul(str, 0, &val)) { -		pr_warning("invalid gcov_persist parameter '%s'\n", str); +		pr_warn("invalid gcov_persist parameter '%s'\n", str);  		return 0;  	}  	gcov_persist = val; @@ -242,7 +242,7 @@ static struct gcov_node *get_node_by_name(const char *name)  	list_for_each_entry(node, &all_head, all) {  		info = get_node_info(node); -		if (info && (strcmp(info->filename, name) == 0)) +		if (info && (strcmp(gcov_info_filename(info), name) == 0))  			return node;  	} @@ -279,7 +279,7 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,  	seq = file->private_data;  	info = gcov_iter_get_info(seq->private);  	mutex_lock(&node_lock); -	node = get_node_by_name(info->filename); +	node = get_node_by_name(gcov_info_filename(info));  	if (node) {  		/* Reset counts or remove node for unloaded modules. 
*/  		if (node->num_loaded == 0) @@ -365,7 +365,7 @@ static const char *deskew(const char *basename)   */  static void add_links(struct gcov_node *node, struct dentry *parent)  { -	char *basename; +	const char *basename;  	char *target;  	int num;  	int i; @@ -376,14 +376,14 @@ static void add_links(struct gcov_node *node, struct dentry *parent)  	if (!node->links)  		return;  	for (i = 0; i < num; i++) { -		target = get_link_target(get_node_info(node)->filename, -					 &gcov_link[i]); +		target = get_link_target( +				gcov_info_filename(get_node_info(node)), +				&gcov_link[i]);  		if (!target)  			goto out_err; -		basename = strrchr(target, '/'); -		if (!basename) +		basename = kbasename(target); +		if (basename == target)  			goto out_err; -		basename++;  		node->links[i] = debugfs_create_symlink(deskew(basename),  							parent,	target);  		if (!node->links[i]) @@ -450,7 +450,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,  	} else  		node->dentry = debugfs_create_dir(node->name, parent->dentry);  	if (!node->dentry) { -		pr_warning("could not create file\n"); +		pr_warn("could not create file\n");  		kfree(node);  		return NULL;  	} @@ -463,7 +463,7 @@ static struct gcov_node *new_node(struct gcov_node *parent,  err_nomem:  	kfree(node); -	pr_warning("out of memory\n"); +	pr_warn("out of memory\n");  	return NULL;  } @@ -576,7 +576,7 @@ static void add_node(struct gcov_info *info)  	struct gcov_node *parent;  	struct gcov_node *node; -	filename = kstrdup(info->filename, GFP_KERNEL); +	filename = kstrdup(gcov_info_filename(info), GFP_KERNEL);  	if (!filename)  		return;  	parent = &root_node; @@ -630,8 +630,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)  	 */  	loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);  	if (!loaded_info) { -		pr_warning("could not add '%s' (out of memory)\n", -			   info->filename); +		pr_warn("could not add '%s' (out of memory)\n", +			gcov_info_filename(info));  		return;  	}  	memcpy(loaded_info, node->loaded_info, @@ -644,8 +644,9 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)  		 * data set replaces the copy of the last one.  		 */  		if (!gcov_info_is_compatible(node->unloaded_info, info)) { -			pr_warning("discarding saved data for %s " -				   "(incompatible version)\n", info->filename); +			pr_warn("discarding saved data for %s " +				"(incompatible version)\n", +				gcov_info_filename(info));  			gcov_info_free(node->unloaded_info);  			node->unloaded_info = NULL;  		} @@ -655,8 +656,8 @@ static void add_info(struct gcov_node *node, struct gcov_info *info)  		 * The initial one takes precedence.  		 
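
[Editor's note] add_links() above replaces the open-coded strrchr() handling with kbasename() and treats basename == target (no directory component) as an error. A userspace approximation of that helper and check, not part of the patch; the path shown is illustrative:

#include <stdio.h>
#include <string.h>

/* Userspace equivalent of the kernel's kbasename() helper. */
static const char *basename_of(const char *path)
{
	const char *tail = strrchr(path, '/');

	return tail ? tail + 1 : path;
}

int main(void)
{
	const char *target = "/sys/kernel/debug/gcov/tmp/foo.gcda";
	const char *base = basename_of(target);

	if (base == target)	/* no '/' at all: not an absolute path */
		fprintf(stderr, "unexpected link target: %s\n", target);
	else
		printf("symlink name: %s\n", base);
	return 0;
}
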
*/  		if (!gcov_info_is_compatible(node->loaded_info[0], info)) { -			pr_warning("could not add '%s' (incompatible " -				   "version)\n", info->filename); +			pr_warn("could not add '%s' (incompatible " +				"version)\n", gcov_info_filename(info));  			kfree(loaded_info);  			return;  		} @@ -691,8 +692,9 @@ static void save_info(struct gcov_node *node, struct gcov_info *info)  	else {  		node->unloaded_info = gcov_info_dup(info);  		if (!node->unloaded_info) { -			pr_warning("could not save data for '%s' " -				   "(out of memory)\n", info->filename); +			pr_warn("could not save data for '%s' " +				"(out of memory)\n", +				gcov_info_filename(info));  		}  	}  } @@ -707,8 +709,8 @@ static void remove_info(struct gcov_node *node, struct gcov_info *info)  	i = get_info_index(node, info);  	if (i < 0) { -		pr_warning("could not remove '%s' (not found)\n", -			   info->filename); +		pr_warn("could not remove '%s' (not found)\n", +			gcov_info_filename(info));  		return;  	}  	if (gcov_persist) @@ -735,7 +737,7 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)  	struct gcov_node *node;  	mutex_lock(&node_lock); -	node = get_node_by_name(info->filename); +	node = get_node_by_name(gcov_info_filename(info));  	switch (action) {  	case GCOV_ADD:  		if (node) @@ -747,8 +749,8 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)  		if (node)  			remove_info(node, info);  		else { -			pr_warning("could not remove '%s' (not found)\n", -				   info->filename); +			pr_warn("could not remove '%s' (not found)\n", +				gcov_info_filename(info));  		}  		break;  	} diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index ae5bb4260033..27bc88a35013 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -21,6 +21,121 @@  #include <linux/vmalloc.h>  #include "gcov.h" +#define GCOV_COUNTERS		5 + +static struct gcov_info *gcov_info_head; + +/** + * struct gcov_fn_info - profiling meta data per function + * @ident: object file-unique function identifier + * @checksum: function checksum + * @n_ctrs: number of values per counter type belonging to this function + * + * This data is generated by gcc during compilation and doesn't change + * at run-time. + */ +struct gcov_fn_info { +	unsigned int ident; +	unsigned int checksum; +	unsigned int n_ctrs[0]; +}; + +/** + * struct gcov_ctr_info - profiling data per counter type + * @num: number of counter values for this type + * @values: array of counter values for this type + * @merge: merge function for counter values of this type (unused) + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the values array. + */ +struct gcov_ctr_info { +	unsigned int	num; +	gcov_type	*values; +	void		(*merge)(gcov_type *, unsigned int); +}; + +/** + * struct gcov_info - profiling data per object file + * @version: gcov version magic indicating the gcc version used for compilation + * @next: list head for a singly-linked list + * @stamp: time stamp + * @filename: name of the associated gcov data file + * @n_functions: number of instrumented functions + * @functions: function data + * @ctr_mask: mask specifying which counter types are active + * @counts: counter data per counter type + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the next pointer. 
+ */ +struct gcov_info { +	unsigned int			version; +	struct gcov_info		*next; +	unsigned int			stamp; +	const char			*filename; +	unsigned int			n_functions; +	const struct gcov_fn_info	*functions; +	unsigned int			ctr_mask; +	struct gcov_ctr_info		counts[0]; +}; + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ +	return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ +	return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ +	if (!info) +		return gcov_info_head; + +	return info->next; +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ +	info->next = gcov_info_head; +	gcov_info_head = info; +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ +	if (prev) +		prev->next = info->next; +	else +		gcov_info_head = info->next; +} +  /* Symbolic links to be created for each profiling data file. */  const struct gcov_link gcov_link[] = {  	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c new file mode 100644 index 000000000000..2c6e4631c814 --- /dev/null +++ b/kernel/gcov/gcc_4_7.c @@ -0,0 +1,560 @@ +/* + *  This code provides functions to handle gcc's profiling data format + *  introduced with gcc 4.7. + * + *  This file is based heavily on gcc_3_4.c file. + * + *  For a better understanding, refer to gcc source: + *  gcc/gcov-io.h + *  libgcc/libgcov.c + * + *  Uses gcc-internal data definitions. + */ + +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/vmalloc.h> +#include "gcov.h" + +#define GCOV_COUNTERS			8 +#define GCOV_TAG_FUNCTION_LENGTH	3 + +static struct gcov_info *gcov_info_head; + +/** + * struct gcov_ctr_info - information about counters for a single function + * @num: number of counter values for this type + * @values: array of counter values for this type + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the values array. + */ +struct gcov_ctr_info { +	unsigned int num; +	gcov_type *values; +}; + +/** + * struct gcov_fn_info - profiling meta data per function + * @key: comdat key + * @ident: unique ident of function + * @lineno_checksum: function lineo_checksum + * @cfg_checksum: function cfg checksum + * @ctrs: instrumented counters + * + * This data is generated by gcc during compilation and doesn't change + * at run-time. + * + * Information about a single function.  This uses the trailing array + * idiom. The number of counters is determined from the merge pointer + * array in gcov_info.  The key is used to detect which of a set of + * comdat functions was selected -- it points to the gcov_info object + * of the object file containing the selected comdat function. 
+ */ +struct gcov_fn_info { +	const struct gcov_info *key; +	unsigned int ident; +	unsigned int lineno_checksum; +	unsigned int cfg_checksum; +	struct gcov_ctr_info ctrs[0]; +}; + +/** + * struct gcov_info - profiling data per object file + * @version: gcov version magic indicating the gcc version used for compilation + * @next: list head for a singly-linked list + * @stamp: uniquifying time stamp + * @filename: name of the associated gcov data file + * @merge: merge functions (null for unused counter type) + * @n_functions: number of instrumented functions + * @functions: pointer to pointers to function information + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the next pointer. + */ +struct gcov_info { +	unsigned int version; +	struct gcov_info *next; +	unsigned int stamp; +	const char *filename; +	void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int); +	unsigned int n_functions; +	struct gcov_fn_info **functions; +}; + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ +	return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ +	return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL. + */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ +	if (!info) +		return gcov_info_head; + +	return info->next; +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ +	info->next = gcov_info_head; +	gcov_info_head = info; +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ +	if (prev) +		prev->next = info->next; +	else +		gcov_info_head = info->next; +} + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { +	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */ +	{ 0, NULL}, +}; + +/* + * Determine whether a counter is active. Doesn't change at run-time. + */ +static int counter_active(struct gcov_info *info, unsigned int type) +{ +	return info->merge[type] ? 1 : 0; +} + +/* Determine number of active counters. Based on gcc magic. 
*/ +static unsigned int num_counter_active(struct gcov_info *info) +{ +	unsigned int i; +	unsigned int result = 0; + +	for (i = 0; i < GCOV_COUNTERS; i++) { +		if (counter_active(info, i)) +			result++; +	} +	return result; +} + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ +	struct gcov_ctr_info *ci_ptr; +	unsigned int fi_idx; +	unsigned int ct_idx; + +	for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { +		ci_ptr = info->functions[fi_idx]->ctrs; + +		for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { +			if (!counter_active(info, ct_idx)) +				continue; + +			memset(ci_ptr->values, 0, +					sizeof(gcov_type) * ci_ptr->num); +			ci_ptr++; +		} +	} +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. + */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ +	return (info1->stamp == info2->stamp); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. + */ +void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) +{ +	struct gcov_ctr_info *dci_ptr; +	struct gcov_ctr_info *sci_ptr; +	unsigned int fi_idx; +	unsigned int ct_idx; +	unsigned int val_idx; + +	for (fi_idx = 0; fi_idx < src->n_functions; fi_idx++) { +		dci_ptr = dst->functions[fi_idx]->ctrs; +		sci_ptr = src->functions[fi_idx]->ctrs; + +		for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { +			if (!counter_active(src, ct_idx)) +				continue; + +			for (val_idx = 0; val_idx < sci_ptr->num; val_idx++) +				dci_ptr->values[val_idx] += +					sci_ptr->values[val_idx]; + +			dci_ptr++; +			sci_ptr++; +		} +	} +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. 
+ */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ +	struct gcov_info *dup; +	struct gcov_ctr_info *dci_ptr; /* dst counter info */ +	struct gcov_ctr_info *sci_ptr; /* src counter info */ +	unsigned int active; +	unsigned int fi_idx; /* function info idx */ +	unsigned int ct_idx; /* counter type idx */ +	size_t fi_size; /* function info size */ +	size_t cv_size; /* counter values size */ + +	dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); +	if (!dup) +		return NULL; + +	dup->next = NULL; +	dup->filename = NULL; +	dup->functions = NULL; + +	dup->filename = kstrdup(info->filename, GFP_KERNEL); +	if (!dup->filename) +		goto err_free; + +	dup->functions = kcalloc(info->n_functions, +				 sizeof(struct gcov_fn_info *), GFP_KERNEL); +	if (!dup->functions) +		goto err_free; + +	active = num_counter_active(info); +	fi_size = sizeof(struct gcov_fn_info); +	fi_size += sizeof(struct gcov_ctr_info) * active; + +	for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { +		dup->functions[fi_idx] = kzalloc(fi_size, GFP_KERNEL); +		if (!dup->functions[fi_idx]) +			goto err_free; + +		*(dup->functions[fi_idx]) = *(info->functions[fi_idx]); + +		sci_ptr = info->functions[fi_idx]->ctrs; +		dci_ptr = dup->functions[fi_idx]->ctrs; + +		for (ct_idx = 0; ct_idx < active; ct_idx++) { + +			cv_size = sizeof(gcov_type) * sci_ptr->num; + +			dci_ptr->values = vmalloc(cv_size); + +			if (!dci_ptr->values) +				goto err_free; + +			dci_ptr->num = sci_ptr->num; +			memcpy(dci_ptr->values, sci_ptr->values, cv_size); + +			sci_ptr++; +			dci_ptr++; +		} +	} + +	return dup; +err_free: +	gcov_info_free(dup); +	return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ +	unsigned int active; +	unsigned int fi_idx; +	unsigned int ct_idx; +	struct gcov_ctr_info *ci_ptr; + +	if (!info->functions) +		goto free_info; + +	active = num_counter_active(info); + +	for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { +		if (!info->functions[fi_idx]) +			continue; + +		ci_ptr = info->functions[fi_idx]->ctrs; + +		for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++) +			vfree(ci_ptr->values); + +		kfree(info->functions[fi_idx]); +	} + +free_info: +	kfree(info->functions); +	kfree(info->filename); +	kfree(info); +} + +#define ITER_STRIDE	PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { +	struct gcov_info *info; +	void *buffer; +	size_t size; +	loff_t pos; +}; + +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. 
+ */ +static size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ +	u32 *data; + +	if (buffer) { +		data = buffer + off; +		*data = v; +	} + +	return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +static size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ +	u32 *data; + +	if (buffer) { +		data = buffer + off; + +		data[0] = (v & 0xffffffffUL); +		data[1] = (v >> 32); +	} + +	return sizeof(*data) * 2; +} + +/** + * convert_to_gcda - convert profiling data set to gcda file format + * @buffer: the buffer to store file data or %NULL if no data should be stored + * @info: profiling data set to be converted + * + * Returns the number of bytes that were/would have been stored into the buffer. + */ +static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +{ +	struct gcov_fn_info *fi_ptr; +	struct gcov_ctr_info *ci_ptr; +	unsigned int fi_idx; +	unsigned int ct_idx; +	unsigned int cv_idx; +	size_t pos = 0; + +	/* File header. */ +	pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); +	pos += store_gcov_u32(buffer, pos, info->version); +	pos += store_gcov_u32(buffer, pos, info->stamp); + +	for (fi_idx = 0; fi_idx < info->n_functions; fi_idx++) { +		fi_ptr = info->functions[fi_idx]; + +		/* Function record. */ +		pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); +		pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION_LENGTH); +		pos += store_gcov_u32(buffer, pos, fi_ptr->ident); +		pos += store_gcov_u32(buffer, pos, fi_ptr->lineno_checksum); +		pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); + +		ci_ptr = fi_ptr->ctrs; + +		for (ct_idx = 0; ct_idx < GCOV_COUNTERS; ct_idx++) { +			if (!counter_active(info, ct_idx)) +				continue; + +			/* Counter record. */ +			pos += store_gcov_u32(buffer, pos, +					      GCOV_TAG_FOR_COUNTER(ct_idx)); +			pos += store_gcov_u32(buffer, pos, ci_ptr->num * 2); + +			for (cv_idx = 0; cv_idx < ci_ptr->num; cv_idx++) { +				pos += store_gcov_u64(buffer, pos, +						      ci_ptr->values[cv_idx]); +			} + +			ci_ptr++; +		} +	} + +	return pos; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ +	struct gcov_iterator *iter; + +	iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); +	if (!iter) +		goto err_free; + +	iter->info = info; +	/* Dry-run to get the actual buffer size. 
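
[Editor's note] store_gcov_u32()/store_gcov_u64() above only report the record size when the buffer is NULL, so convert_to_gcda() can run twice: a dry run to size the buffer, then a second pass to fill it. A minimal userspace sketch of that two-pass pattern, not part of the patch, reusing the gcda magic from gcov.h but with an otherwise invented record layout:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Store a 32-bit value in host byte order; with buf == NULL only report size. */
static size_t put_u32(void *buf, size_t off, uint32_t v)
{
	if (buf)
		memcpy((char *)buf + off, &v, sizeof(v));
	return sizeof(v);
}

/* 64-bit values go out as two 32-bit halves, low word first. */
static size_t put_u64(void *buf, size_t off, uint64_t v)
{
	size_t pos = off;

	pos += put_u32(buf, pos, (uint32_t)(v & 0xffffffffUL));
	pos += put_u32(buf, pos, (uint32_t)(v >> 32));
	return pos - off;
}

/* Invented example record: magic word, count, then the counters themselves. */
static size_t serialize(void *buf, const uint64_t *counters, uint32_t n)
{
	size_t pos = 0;
	uint32_t i;

	pos += put_u32(buf, pos, 0x67636461);	/* GCOV_DATA_MAGIC, as in gcov.h */
	pos += put_u32(buf, pos, n);
	for (i = 0; i < n; i++)
		pos += put_u64(buf, pos, counters[i]);
	return pos;
}

int main(void)
{
	uint64_t counters[] = { 1, 2, 3 };
	size_t size = serialize(NULL, counters, 3);	/* dry run to size */
	void *buf = malloc(size);

	if (!buf)
		return 1;
	serialize(buf, counters, 3);			/* second pass fills */
	printf("serialized %zu bytes\n", size);
	free(buf);
	return 0;
}
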
*/ +	iter->size = convert_to_gcda(NULL, info); +	iter->buffer = vmalloc(iter->size); +	if (!iter->buffer) +		goto err_free; + +	convert_to_gcda(iter->buffer, info); + +	return iter; + +err_free: +	kfree(iter); +	return NULL; +} + + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ +	vfree(iter->buffer); +	kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ +	return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ +	iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +int gcov_iter_next(struct gcov_iterator *iter) +{ +	if (iter->pos < iter->size) +		iter->pos += ITER_STRIDE; + +	if (iter->pos >= iter->size) +		return -EINVAL; + +	return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ +	size_t len; + +	if (iter->pos >= iter->size) +		return -EINVAL; + +	len = ITER_STRIDE; +	if (iter->pos + len > iter->size) +		len = iter->size - iter->pos; + +	seq_write(seq, iter->buffer + iter->pos, len); + +	return 0; +} diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 060073ebf7a6..92c8e22a29ed 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -21,7 +21,6 @@   * gcc and need to be kept as close to the original definition as possible to   * remain compatible.   */ -#define GCOV_COUNTERS		5  #define GCOV_DATA_MAGIC		((unsigned int) 0x67636461)  #define GCOV_TAG_FUNCTION	((unsigned int) 0x01000000)  #define GCOV_TAG_COUNTER_BASE	((unsigned int) 0x01a10000) @@ -34,60 +33,18 @@ typedef long gcov_type;  typedef long long gcov_type;  #endif -/** - * struct gcov_fn_info - profiling meta data per function - * @ident: object file-unique function identifier - * @checksum: function checksum - * @n_ctrs: number of values per counter type belonging to this function - * - * This data is generated by gcc during compilation and doesn't change - * at run-time. - */ -struct gcov_fn_info { -	unsigned int ident; -	unsigned int checksum; -	unsigned int n_ctrs[0]; -}; - -/** - * struct gcov_ctr_info - profiling data per counter type - * @num: number of counter values for this type - * @values: array of counter values for this type - * @merge: merge function for counter values of this type (unused) - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the values array. - */ -struct gcov_ctr_info { -	unsigned int	num; -	gcov_type	*values; -	void		(*merge)(gcov_type *, unsigned int); -}; +/* Opaque gcov_info. The gcov structures can change as for example in gcc 4.7 so + * we cannot use full definition here and they need to be placed in gcc specific + * implementation of gcov. 
This also means no direct access to the members in + * generic code and usage of the interface below.*/ +struct gcov_info; -/** - * struct gcov_info - profiling data per object file - * @version: gcov version magic indicating the gcc version used for compilation - * @next: list head for a singly-linked list - * @stamp: time stamp - * @filename: name of the associated gcov data file - * @n_functions: number of instrumented functions - * @functions: function data - * @ctr_mask: mask specifying which counter types are active - * @counts: counter data per counter type - * - * This data is generated by gcc during compilation and doesn't change - * at run-time with the exception of the next pointer. - */ -struct gcov_info { -	unsigned int			version; -	struct gcov_info		*next; -	unsigned int			stamp; -	const char			*filename; -	unsigned int			n_functions; -	const struct gcov_fn_info	*functions; -	unsigned int			ctr_mask; -	struct gcov_ctr_info		counts[0]; -}; +/* Interface to access gcov_info data  */ +const char *gcov_info_filename(struct gcov_info *info); +unsigned int gcov_info_version(struct gcov_info *info); +struct gcov_info *gcov_info_next(struct gcov_info *info); +void gcov_info_link(struct gcov_info *info); +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);  /* Base interface. */  enum gcov_action { diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 3e97fb126e6b..9328b80eaf14 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -16,11 +16,12 @@  #include <linux/export.h>  #include <linux/sysctl.h>  #include <linux/utsname.h> +#include <trace/events/sched.h>  /*   * The number of tasks checked:   */ -unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; +int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;  /*   * Limit number of tasks checked in a batch. 
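
[Editor's note] The gcov.h hunk above makes struct gcov_info opaque and routes all list handling through gcov_info_next()/link()/unlink(), so the gcc-version-specific layouts stay private to gcc_3_4.c and gcc_4_7.c. A compact userspace sketch of the same opaque-type-plus-accessors pattern, not part of the patch; struct node and its helpers are illustrative:

#include <stdio.h>

/* "Public" view: the structure is opaque, only accessors are exported. */
struct node;
const char *node_name(struct node *n);
struct node *node_next(struct node *n);
void node_link(struct node *n);
void node_unlink(struct node *prev, struct node *n);

/* "Private" part: would live in one format-specific file, like gcc_4_7.c. */
struct node {
	const char *name;
	struct node *next;
};

static struct node *node_head;

const char *node_name(struct node *n)
{
	return n->name;
}

/* NULL argument returns the list head, mirroring gcov_info_next(). */
struct node *node_next(struct node *n)
{
	return n ? n->next : node_head;
}

void node_link(struct node *n)
{
	n->next = node_head;
	node_head = n;
}

void node_unlink(struct node *prev, struct node *n)
{
	if (prev)
		prev->next = n->next;
	else
		node_head = n->next;
}

int main(void)
{
	struct node a = { "a", NULL }, b = { "b", NULL };
	struct node *it = NULL;

	node_link(&a);
	node_link(&b);
	while ((it = node_next(it)))	/* same walk as gcov_enable_events() */
		printf("%s\n", node_name(it));
	node_unlink(NULL, &b);
	return 0;
}
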
@@ -92,6 +93,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)  		t->last_switch_count = switch_count;  		return;  	} + +	trace_sched_process_hang(t); +  	if (!sysctl_hung_task_warnings)  		return;  	sysctl_hung_task_warnings--; @@ -203,6 +207,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,  	return ret;  } +static atomic_t reset_hung_task = ATOMIC_INIT(0); + +void reset_hung_task_detector(void) +{ +	atomic_set(&reset_hung_task, 1); +} +EXPORT_SYMBOL_GPL(reset_hung_task_detector); +  /*   * kthread which checks for tasks stuck in D state   */ @@ -216,6 +228,9 @@ static int watchdog(void *dummy)  		while (schedule_timeout_interruptible(timeout_jiffies(timeout)))  			timeout = sysctl_hung_task_timeout_secs; +		if (atomic_xchg(&reset_hung_task, 0)) +			continue; +  		check_hung_uninterruptible_tasks(timeout);  	} diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3bb14fbe5c6..dc04c166c54d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -214,7 +214,7 @@ void irq_enable(struct irq_desc *desc)  }  /** - * irq_disable - Mark interupt disabled + * irq_disable - Mark interrupt disabled   * @desc:	irq descriptor which should be disabled   *   * If the chip does not implement the irq_disable callback, we diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 706724e9835d..cf68bb36fe58 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -465,27 +465,26 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,  }  EXPORT_SYMBOL_GPL(irq_create_strict_mappings); -unsigned int irq_create_of_mapping(struct device_node *controller, -				   const u32 *intspec, unsigned int intsize) +unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)  {  	struct irq_domain *domain;  	irq_hw_number_t hwirq;  	unsigned int type = IRQ_TYPE_NONE;  	unsigned int virq; -	domain = controller ? irq_find_host(controller) : irq_default_domain; +	domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;  	if (!domain) {  		pr_warn("no irq domain found for %s !\n", -			of_node_full_name(controller)); +			of_node_full_name(irq_data->np));  		return 0;  	}  	/* If domain has no translation, then we assume interrupt line */  	if (domain->ops->xlate == NULL) -		hwirq = intspec[0]; +		hwirq = irq_data->args[0];  	else { -		if (domain->ops->xlate(domain, controller, intspec, intsize, -				     &hwirq, &type)) +		if (domain->ops->xlate(domain, irq_data->np, irq_data->args, +					irq_data->args_count, &hwirq, &type))  			return 0;  	} diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 514bcfd855a8..481a13c43b17 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -786,7 +786,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)  }  /* - * Interrupts explicitely requested as threaded interupts want to be + * Interrupts explicitly requested as threaded interrupts want to be   * preemtible - many of them need to sleep and wait for slow busses to   * complete.   
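
[Editor's note] The kernel/hung_task.c hunk above lets callers ask the watchdog to skip one scan by setting a flag that the watchdog consumes with atomic_xchg(), so a request raised mid-interval is not lost. A small userspace model of that consume-and-clear pattern with C11 atomics, not part of the patch; the names are illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int reset_requested;

/* Caller side: equivalent of reset_hung_task_detector(). */
static void request_reset(void)
{
	atomic_store(&reset_requested, 1);
}

/*
 * Watchdog side: atomically read and clear the flag, as atomic_xchg(..., 0)
 * does in the patch; returns true if a reset was requested since last check.
 */
static bool consume_reset(void)
{
	return atomic_exchange(&reset_requested, 0) != 0;
}

int main(void)
{
	request_reset();

	/* One iteration of the watchdog loop. */
	if (consume_reset())
		printf("skipping this check interval\n");
	else
		printf("checking for hung tasks\n");
	return 0;
}
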
*/ @@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)  			goto out_mput;  		} -		sched_setscheduler(t, SCHED_FIFO, ¶m); +		sched_setscheduler_nocheck(t, SCHED_FIFO, ¶m);  		/*  		 * We keep the reference to the task struct even if diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 1162f1030f18..3320b84cc60f 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -14,6 +14,7 @@ enum {  	_IRQ_NO_BALANCING	= IRQ_NO_BALANCING,  	_IRQ_NESTED_THREAD	= IRQ_NESTED_THREAD,  	_IRQ_PER_CPU_DEVID	= IRQ_PER_CPU_DEVID, +	_IRQ_IS_POLLED		= IRQ_IS_POLLED,  	_IRQF_MODIFY_MASK	= IRQF_MODIFY_MASK,  }; @@ -26,6 +27,7 @@ enum {  #define IRQ_NOAUTOEN		GOT_YOU_MORON  #define IRQ_NESTED_THREAD	GOT_YOU_MORON  #define IRQ_PER_CPU_DEVID	GOT_YOU_MORON +#define IRQ_IS_POLLED		GOT_YOU_MORON  #undef IRQF_MODIFY_MASK  #define IRQF_MODIFY_MASK	GOT_YOU_MORON @@ -147,3 +149,8 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)  {  	return desc->status_use_accessors & _IRQ_NESTED_THREAD;  } + +static inline bool irq_settings_is_polled(struct irq_desc *desc) +{ +	return desc->status_use_accessors & _IRQ_IS_POLLED; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7b5f012bde9d..a1d8cc63b56e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -67,8 +67,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)  	raw_spin_lock(&desc->lock); -	/* PER_CPU and nested thread interrupts are never polled */ -	if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) +	/* +	 * PER_CPU, nested thread interrupts and interrupts explicitely +	 * marked polled are excluded from polling. +	 */ +	if (irq_settings_is_per_cpu(desc) || +	    irq_settings_is_nested_thread(desc) || +	    irq_settings_is_polled(desc))  		goto out;  	/* @@ -268,7 +273,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,  void note_interrupt(unsigned int irq, struct irq_desc *desc,  		    irqreturn_t action_ret)  { -	if (desc->istate & IRQS_POLL_INPROGRESS) +	if (desc->istate & IRQS_POLL_INPROGRESS || +	    irq_settings_is_polled(desc))  		return;  	/* we get here again via the threaded handler */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 297a9247a3b3..9019f15deab2 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,6 +58,7 @@ static void jump_label_update(struct static_key *key, int enable);  void static_key_slow_inc(struct static_key *key)  { +	STATIC_KEY_CHECK_USE();  	if (atomic_inc_not_zero(&key->enabled))  		return; @@ -103,12 +104,14 @@ static void jump_label_update_timeout(struct work_struct *work)  void static_key_slow_dec(struct static_key *key)  { +	STATIC_KEY_CHECK_USE();  	__static_key_slow_dec(key, 0, NULL);  }  EXPORT_SYMBOL_GPL(static_key_slow_dec);  void static_key_slow_dec_deferred(struct static_key_deferred *key)  { +	STATIC_KEY_CHECK_USE();  	__static_key_slow_dec(&key->key, key->timeout, &key->work);  }  EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); @@ -116,6 +119,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);  void jump_label_rate_limit(struct static_key_deferred *key,  		unsigned long rl)  { +	STATIC_KEY_CHECK_USE();  	key->timeout = rl;  	INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);  } @@ -212,6 +216,7 @@ void __init jump_label_init(void)  		key->next = NULL;  #endif  	} +	static_key_initialized = true;  	jump_label_unlock();  } diff --git a/kernel/kexec.c b/kernel/kexec.c index 2a74f307c5ec..490afc03627e 100644 --- 
a/kernel/kexec.c +++ b/kernel/kexec.c @@ -921,7 +921,7 @@ static int kimage_load_segment(struct kimage *image,   *   reinitialize them.   *   * - A machine specific part that includes the syscall number - *   and the copies the image to it's final destination.  And + *   and then copies the image to it's final destination.  And   *   jumps into the image at entry.   *   * kexec does not sync, or unmount filesystems so if you need diff --git a/kernel/kprobes.c b/kernel/kprobes.c index a0d367a49122..ceeadfcabb76 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2066,7 +2066,7 @@ static int __init init_kprobes(void)  {  	int i, err = 0;  	unsigned long offset = 0, size = 0; -	char *modname, namebuf[128]; +	char *modname, namebuf[KSYM_NAME_LEN];  	const char *symbol_name;  	void *addr;  	struct kprobe_blackpoint *kb; @@ -2192,7 +2192,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)  	const char *sym = NULL;  	unsigned int i = *(loff_t *) v;  	unsigned long offset = 0; -	char *modname, namebuf[128]; +	char *modname, namebuf[KSYM_NAME_LEN];  	head = &kprobe_table[i];  	preempt_disable(); diff --git a/kernel/kthread.c b/kernel/kthread.c index 760e86df8c20..b5ae3ee860a9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -33,7 +33,7 @@ struct kthread_create_info  	/* Result passed back to kthread_create() from kthreadd. */  	struct task_struct *result; -	struct completion done; +	struct completion *done;  	struct list_head list;  }; @@ -178,6 +178,7 @@ static int kthread(void *_create)  	struct kthread_create_info *create = _create;  	int (*threadfn)(void *data) = create->threadfn;  	void *data = create->data; +	struct completion *done;  	struct kthread self;  	int ret; @@ -187,10 +188,16 @@ static int kthread(void *_create)  	init_completion(&self.parked);  	current->vfork_done = &self.exited; +	/* If user was SIGKILLed, I release the structure. */ +	done = xchg(&create->done, NULL); +	if (!done) { +		kfree(create); +		do_exit(-EINTR); +	}  	/* OK, tell user we're spawned, wait for stop or wakeup */  	__set_current_state(TASK_UNINTERRUPTIBLE);  	create->result = current; -	complete(&create->done); +	complete(done);  	schedule();  	ret = -EINTR; @@ -223,8 +230,15 @@ static void create_kthread(struct kthread_create_info *create)  	/* We want our own signal handler (we take no signals by default). */  	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);  	if (pid < 0) { +		/* If user was SIGKILLed, I release the structure. */ +		struct completion *done = xchg(&create->done, NULL); + +		if (!done) { +			kfree(create); +			return; +		}  		create->result = ERR_PTR(pid); -		complete(&create->done); +		complete(done);  	}  } @@ -255,36 +269,59 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),  					   const char namefmt[],  					   ...)  
{ -	struct kthread_create_info create; - -	create.threadfn = threadfn; -	create.data = data; -	create.node = node; -	init_completion(&create.done); +	DECLARE_COMPLETION_ONSTACK(done); +	struct task_struct *task; +	struct kthread_create_info *create = kmalloc(sizeof(*create), +						     GFP_KERNEL); + +	if (!create) +		return ERR_PTR(-ENOMEM); +	create->threadfn = threadfn; +	create->data = data; +	create->node = node; +	create->done = &done;  	spin_lock(&kthread_create_lock); -	list_add_tail(&create.list, &kthread_create_list); +	list_add_tail(&create->list, &kthread_create_list);  	spin_unlock(&kthread_create_lock);  	wake_up_process(kthreadd_task); -	wait_for_completion(&create.done); - -	if (!IS_ERR(create.result)) { +	/* +	 * Wait for completion in killable state, for I might be chosen by +	 * the OOM killer while kthreadd is trying to allocate memory for +	 * new kernel thread. +	 */ +	if (unlikely(wait_for_completion_killable(&done))) { +		/* +		 * If I was SIGKILLed before kthreadd (or new kernel thread) +		 * calls complete(), leave the cleanup of this structure to +		 * that thread. +		 */ +		if (xchg(&create->done, NULL)) +			return ERR_PTR(-ENOMEM); +		/* +		 * kthreadd (or new kernel thread) will call complete() +		 * shortly. +		 */ +		wait_for_completion(&done); +	} +	task = create->result; +	if (!IS_ERR(task)) {  		static const struct sched_param param = { .sched_priority = 0 };  		va_list args;  		va_start(args, namefmt); -		vsnprintf(create.result->comm, sizeof(create.result->comm), -			  namefmt, args); +		vsnprintf(task->comm, sizeof(task->comm), namefmt, args);  		va_end(args);  		/*  		 * root may have changed our (kthreadd's) priority or CPU mask.  		 * The kernel thread should not inherit these properties.  		 */ -		sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); -		set_cpus_allowed_ptr(create.result, cpu_all_mask); +		sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); +		set_cpus_allowed_ptr(task, cpu_all_mask);  	} -	return create.result; +	kfree(create); +	return task;  }  EXPORT_SYMBOL(kthread_create_on_node); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile new file mode 100644 index 000000000000..baab8e5e7f66 --- /dev/null +++ b/kernel/locking/Makefile @@ -0,0 +1,25 @@ + +obj-y += mutex.o semaphore.o rwsem.o lglock.o + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg +endif + +obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +obj-$(CONFIG_LOCKDEP) += lockdep.o +ifeq ($(CONFIG_PROC_FS),y) +obj-$(CONFIG_LOCKDEP) += lockdep_proc.o +endif +obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_PROVE_LOCKING) += spinlock.o +obj-$(CONFIG_RT_MUTEXES) += rtmutex.o +obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o +obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o +obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o +obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o +obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o diff --git a/kernel/lglock.c b/kernel/locking/lglock.c index 86ae2aebf004..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/locking/lglock.c diff --git a/kernel/lockdep.c b/kernel/locking/lockdep.c index e16c45b9ee77..576ba756a32d 100644 --- a/kernel/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1232,7 +1232,7 @@ static int noop_count(struct lock_list *entry, void *data)  	return 0;  } -unsigned long 
__lockdep_count_forward_deps(struct lock_list *this) +static unsigned long __lockdep_count_forward_deps(struct lock_list *this)  {  	unsigned long  count = 0;  	struct lock_list *uninitialized_var(target_entry); @@ -1258,7 +1258,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)  	return ret;  } -unsigned long __lockdep_count_backward_deps(struct lock_list *this) +static unsigned long __lockdep_count_backward_deps(struct lock_list *this)  {  	unsigned long  count = 0;  	struct lock_list *uninitialized_var(target_entry); @@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)  	printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",  	       !rcu_lockdep_current_cpu_online()  			? "RCU used illegally from offline CPU!\n" -			: rcu_is_cpu_idle() +			: !rcu_is_watching()  				? "RCU used illegally from idle CPU!\n"  				: "",  	       rcu_scheduler_active, debug_locks); @@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)  	 * So complain bitterly if someone does call rcu_read_lock(),  	 * rcu_read_lock_bh() and so on from extended quiescent states.  	 */ -	if (rcu_is_cpu_idle()) +	if (!rcu_is_watching())  		printk("RCU used illegally from extended quiescent state!\n");  	lockdep_print_held_locks(curr); diff --git a/kernel/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 4f560cfedc8f..4f560cfedc8f 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h diff --git a/kernel/lockdep_proc.c b/kernel/locking/lockdep_proc.c index b2c71c5873e4..ef43ac4bafb5 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -421,6 +421,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)  	seq_time(m, lt->min);  	seq_time(m, lt->max);  	seq_time(m, lt->total); +	seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);  }  static void seq_stats(struct seq_file *m, struct lock_stat_data *data) @@ -518,20 +519,20 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)  	}  	if (i) {  		seq_puts(m, "\n"); -		seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); +		seq_line(m, '.', 0, 40 + 1 + 12 * (14 + 1));  		seq_puts(m, "\n");  	}  }  static void seq_header(struct seq_file *m)  { -	seq_printf(m, "lock_stat version 0.3\n"); +	seq_puts(m, "lock_stat version 0.4\n");  	if (unlikely(!debug_locks))  		seq_printf(m, "*WARNING* lock debugging disabled!! 
- possibly due to a lockdep warning\n"); -	seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); -	seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " +	seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1)); +	seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s "  			"%14s %14s\n",  			"class name",  			"con-bounces", @@ -539,12 +540,14 @@ static void seq_header(struct seq_file *m)  			"waittime-min",  			"waittime-max",  			"waittime-total", +			"waittime-avg",  			"acq-bounces",  			"acquisitions",  			"holdtime-min",  			"holdtime-max", -			"holdtime-total"); -	seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); +			"holdtime-total", +			"holdtime-avg"); +	seq_line(m, '-', 0, 40 + 1 + 12 * (14 + 1));  	seq_printf(m, "\n");  } diff --git a/kernel/lockdep_states.h b/kernel/locking/lockdep_states.h index 995b0cc2b84c..995b0cc2b84c 100644 --- a/kernel/lockdep_states.h +++ b/kernel/locking/lockdep_states.h diff --git a/kernel/mutex-debug.c b/kernel/locking/mutex-debug.c index 7e3443fe1f48..7e3443fe1f48 100644 --- a/kernel/mutex-debug.c +++ b/kernel/locking/mutex-debug.c diff --git a/kernel/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/locking/mutex-debug.h diff --git a/kernel/mutex.c b/kernel/locking/mutex.c index 6d647aedffea..4dd6e4c219de 100644 --- a/kernel/mutex.c +++ b/kernel/locking/mutex.c @@ -1,5 +1,5 @@  /* - * kernel/mutex.c + * kernel/locking/mutex.c   *   * Mutexes: blocking mutual exclusion locks   * @@ -410,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,  static __always_inline int __sched  __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		    struct lockdep_map *nest_lock, unsigned long ip, -		    struct ww_acquire_ctx *ww_ctx) +		    struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)  {  	struct task_struct *task = current;  	struct mutex_waiter waiter; @@ -450,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		struct task_struct *owner;  		struct mspin_node  node; -		if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { +		if (use_ww_ctx && ww_ctx->acquired > 0) {  			struct ww_mutex *ww;  			ww = container_of(lock, struct ww_mutex, base); @@ -480,7 +480,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		if ((atomic_read(&lock->count) == 1) &&  		    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {  			lock_acquired(&lock->dep_map, ip); -			if (!__builtin_constant_p(ww_ctx == NULL)) { +			if (use_ww_ctx) {  				struct ww_mutex *ww;  				ww = container_of(lock, struct ww_mutex, base); @@ -551,7 +551,7 @@ slowpath:  			goto err;  		} -		if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { +		if (use_ww_ctx && ww_ctx->acquired > 0) {  			ret = __mutex_lock_check_stamp(lock, ww_ctx);  			if (ret)  				goto err; @@ -575,7 +575,7 @@ skip_wait:  	lock_acquired(&lock->dep_map, ip);  	mutex_set_owner(lock); -	if (!__builtin_constant_p(ww_ctx == NULL)) { +	if (use_ww_ctx) {  		struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);  		struct mutex_waiter *cur; @@ -615,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)  {  	might_sleep();  	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, -			    subclass, NULL, _RET_IP_, NULL); +			    subclass, NULL, _RET_IP_, NULL, 0);  }  EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -625,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)  {  	might_sleep();  	
__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, -			    0, nest, _RET_IP_, NULL); +			    0, nest, _RET_IP_, NULL, 0);  }  EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -635,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)  {  	might_sleep();  	return __mutex_lock_common(lock, TASK_KILLABLE, -				   subclass, NULL, _RET_IP_, NULL); +				   subclass, NULL, _RET_IP_, NULL, 0);  }  EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -644,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)  {  	might_sleep();  	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, -				   subclass, NULL, _RET_IP_, NULL); +				   subclass, NULL, _RET_IP_, NULL, 0);  }  EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -682,7 +682,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)  	might_sleep();  	ret =  __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, -				   0, &ctx->dep_map, _RET_IP_, ctx); +				   0, &ctx->dep_map, _RET_IP_, ctx, 1);  	if (!ret && ctx->acquired > 1)  		return ww_mutex_deadlock_injection(lock, ctx); @@ -697,7 +697,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)  	might_sleep();  	ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, -				  0, &ctx->dep_map, _RET_IP_, ctx); +				  0, &ctx->dep_map, _RET_IP_, ctx, 1);  	if (!ret && ctx->acquired > 1)  		return ww_mutex_deadlock_injection(lock, ctx); @@ -809,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count)  	struct mutex *lock = container_of(lock_count, struct mutex, count);  	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, -			    NULL, _RET_IP_, NULL); +			    NULL, _RET_IP_, NULL, 0);  }  static noinline int __sched  __mutex_lock_killable_slowpath(struct mutex *lock)  {  	return __mutex_lock_common(lock, TASK_KILLABLE, 0, -				   NULL, _RET_IP_, NULL); +				   NULL, _RET_IP_, NULL, 0);  }  static noinline int __sched  __mutex_lock_interruptible_slowpath(struct mutex *lock)  {  	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, -				   NULL, _RET_IP_, NULL); +				   NULL, _RET_IP_, NULL, 0);  }  static noinline int __sched  __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)  {  	return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, -				   NULL, _RET_IP_, ctx); +				   NULL, _RET_IP_, ctx, 1);  }  static noinline int __sched @@ -838,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,  					    struct ww_acquire_ctx *ctx)  {  	return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, -				   NULL, _RET_IP_, ctx); +				   NULL, _RET_IP_, ctx, 1);  }  #endif diff --git a/kernel/mutex.h b/kernel/locking/mutex.h index 4115fbf83b12..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/locking/mutex.h diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c new file mode 100644 index 000000000000..652a8ee8efe9 --- /dev/null +++ b/kernel/locking/percpu-rwsem.c @@ -0,0 +1,165 @@ +#include <linux/atomic.h> +#include <linux/rwsem.h> +#include <linux/percpu.h> +#include <linux/wait.h> +#include <linux/lockdep.h> +#include <linux/percpu-rwsem.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> +#include <linux/errno.h> + +int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, +			const char *name, struct lock_class_key *rwsem_key) +{ +	brw->fast_read_ctr = alloc_percpu(int); +	if (unlikely(!brw->fast_read_ctr)) +		return -ENOMEM; + +	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ +	
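
/*
 * A side note on the __mutex_lock_common() hunks above: every call site now
 * passes a literal 0 or 1 for the new use_ww_ctx parameter, so the
 * always-inlined common path can drop the __builtin_constant_p(ww_ctx == NULL)
 * trick and still have the compiler discard the ww-mutex branches where they
 * are unused.  A stripped-down sketch of the pattern, with illustrative names
 * rather than the kernel's:
 */
#include <linux/compiler.h>
#include <linux/types.h>

struct sketch_lock { int dummy; };

static __always_inline int sketch_lock_common(struct sketch_lock *lock,
                                              void *ctx, const bool use_ctx)
{
        if (use_ctx && ctx) {
                /* ww-mutex-only bookkeeping; dead code when use_ctx == 0 */
        }
        /* ... common slow-path work ... */
        return 0;
}

static int sketch_lock_plain(struct sketch_lock *lock)
{
        return sketch_lock_common(lock, NULL, 0);       /* branch folds away */
}

static int sketch_lock_ww(struct sketch_lock *lock, void *ctx)
{
        return sketch_lock_common(lock, ctx, 1);        /* branch retained */
}
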
__init_rwsem(&brw->rw_sem, name, rwsem_key); +	atomic_set(&brw->write_ctr, 0); +	atomic_set(&brw->slow_read_ctr, 0); +	init_waitqueue_head(&brw->write_waitq); +	return 0; +} + +void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +{ +	free_percpu(brw->fast_read_ctr); +	brw->fast_read_ctr = NULL; /* catch use after free bugs */ +} + +/* + * This is the fast-path for down_read/up_read, it only needs to ensure + * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the + * fast per-cpu counter. The writer uses synchronize_sched_expedited() to + * serialize with the preempt-disabled section below. + * + * The nontrivial part is that we should guarantee acquire/release semantics + * in case when + * + *	R_W: down_write() comes after up_read(), the writer should see all + *	     changes done by the reader + * or + *	W_R: down_read() comes after up_write(), the reader should see all + *	     changes done by the writer + * + * If this helper fails the callers rely on the normal rw_semaphore and + * atomic_dec_and_test(), so in this case we have the necessary barriers. + * + * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or + * __this_cpu_add() below can be reordered with any LOAD/STORE done by the + * reader inside the critical section. See the comments in down_write and + * up_write below. + */ +static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +{ +	bool success = false; + +	preempt_disable(); +	if (likely(!atomic_read(&brw->write_ctr))) { +		__this_cpu_add(*brw->fast_read_ctr, val); +		success = true; +	} +	preempt_enable(); + +	return success; +} + +/* + * Like the normal down_read() this is not recursive, the writer can + * come after the first percpu_down_read() and create the deadlock. + * + * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, + * percpu_up_read() does rwsem_release(). This pairs with the usage + * of ->rw_sem in percpu_down/up_write(). + */ +void percpu_down_read(struct percpu_rw_semaphore *brw) +{ +	might_sleep(); +	if (likely(update_fast_ctr(brw, +1))) { +		rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); +		return; +	} + +	down_read(&brw->rw_sem); +	atomic_inc(&brw->slow_read_ctr); +	/* avoid up_read()->rwsem_release() */ +	__up_read(&brw->rw_sem); +} + +void percpu_up_read(struct percpu_rw_semaphore *brw) +{ +	rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); + +	if (likely(update_fast_ctr(brw, -1))) +		return; + +	/* false-positive is possible but harmless */ +	if (atomic_dec_and_test(&brw->slow_read_ctr)) +		wake_up_all(&brw->write_waitq); +} + +static int clear_fast_ctr(struct percpu_rw_semaphore *brw) +{ +	unsigned int sum = 0; +	int cpu; + +	for_each_possible_cpu(cpu) { +		sum += per_cpu(*brw->fast_read_ctr, cpu); +		per_cpu(*brw->fast_read_ctr, cpu) = 0; +	} + +	return sum; +} + +/* + * A writer increments ->write_ctr to force the readers to switch to the + * slow mode, note the atomic_read() check in update_fast_ctr(). + * + * After that the readers can only inc/dec the slow ->slow_read_ctr counter, + * ->fast_read_ctr is stable. Once the writer moves its sum into the slow + * counter it represents the number of active readers. + * + * Finally the writer takes ->rw_sem for writing and blocks the new readers, + * then waits until the slow counter becomes zero. + */ +void percpu_down_write(struct percpu_rw_semaphore *brw) +{ +	/* tell update_fast_ctr() there is a pending writer */ +	atomic_inc(&brw->write_ctr); +	/* +	 * 1. 
Ensures that write_ctr != 0 is visible to any down_read/up_read +	 *    so that update_fast_ctr() can't succeed. +	 * +	 * 2. Ensures we see the result of every previous this_cpu_add() in +	 *    update_fast_ctr(). +	 * +	 * 3. Ensures that if any reader has exited its critical section via +	 *    fast-path, it executes a full memory barrier before we return. +	 *    See R_W case in the comment above update_fast_ctr(). +	 */ +	synchronize_sched_expedited(); + +	/* exclude other writers, and block the new readers completely */ +	down_write(&brw->rw_sem); + +	/* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ +	atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + +	/* wait for all readers to complete their percpu_up_read() */ +	wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); +} + +void percpu_up_write(struct percpu_rw_semaphore *brw) +{ +	/* release the lock, but the readers can't use the fast-path */ +	up_write(&brw->rw_sem); +	/* +	 * Insert the barrier before the next fast-path in down_read, +	 * see W_R case in the comment above update_fast_ctr(). +	 */ +	synchronize_sched_expedited(); +	/* the last writer unblocks update_fast_ctr() */ +	atomic_dec(&brw->write_ctr); +} diff --git a/kernel/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 13b243a323fa..13b243a323fa 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c diff --git a/kernel/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index 14193d596d78..14193d596d78 100644 --- a/kernel/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h diff --git a/kernel/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c index 1d96dd0d93c1..1d96dd0d93c1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/locking/rtmutex-tester.c diff --git a/kernel/rtmutex.c b/kernel/locking/rtmutex.c index 0dd6aec1cb6a..0dd6aec1cb6a 100644 --- a/kernel/rtmutex.c +++ b/kernel/locking/rtmutex.c diff --git a/kernel/rtmutex.h b/kernel/locking/rtmutex.h index a1a1dd06421d..a1a1dd06421d 100644 --- a/kernel/rtmutex.h +++ b/kernel/locking/rtmutex.h diff --git a/kernel/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 53a66c85261b..53a66c85261b 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c new file mode 100644 index 000000000000..9be8a9144978 --- /dev/null +++ b/kernel/locking/rwsem-spinlock.c @@ -0,0 +1,296 @@ +/* rwsem-spinlock.c: R/W semaphores: contention handling functions for + * generic spinlock implementation + * + * Copyright (c) 2001   David Howells (dhowells@redhat.com). 
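
/*
 * Pulling the percpu_rw_semaphore pieces above together, a client of the API
 * now housed under kernel/locking/ looks roughly like this; the type and the
 * percpu_*() calls are the real interface from <linux/percpu-rwsem.h>, the
 * example_*() functions are made up:
 */
#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore example_sem;

static int example_setup(void)
{
        /* allocates the per-CPU fast counter; can fail with -ENOMEM */
        return percpu_init_rwsem(&example_sem);
}

static void example_read_side(void)
{
        percpu_down_read(&example_sem);         /* usually just a per-CPU add */
        /* ... read-mostly critical section ... */
        percpu_up_read(&example_sem);
}

static void example_write_side(void)
{
        percpu_down_write(&example_sem);        /* flips readers to the slow path */
        /* ... exclusive update ... */
        percpu_up_write(&example_sem);
}

static void example_teardown(void)
{
        percpu_free_rwsem(&example_sem);
}
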
+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> + * - Derived also from comments by Linus + */ +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/export.h> + +enum rwsem_waiter_type { +	RWSEM_WAITING_FOR_WRITE, +	RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { +	struct list_head list; +	struct task_struct *task; +	enum rwsem_waiter_type type; +}; + +int rwsem_is_locked(struct rw_semaphore *sem) +{ +	int ret = 1; +	unsigned long flags; + +	if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { +		ret = (sem->activity != 0); +		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +	} +	return ret; +} +EXPORT_SYMBOL(rwsem_is_locked); + +/* + * initialise the semaphore + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, +		  struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +	/* +	 * Make sure we are not reinitializing a held semaphore: +	 */ +	debug_check_no_locks_freed((void *)sem, sizeof(*sem)); +	lockdep_init_map(&sem->dep_map, name, key, 0); +#endif +	sem->activity = 0; +	raw_spin_lock_init(&sem->wait_lock); +	INIT_LIST_HEAD(&sem->wait_list); +} +EXPORT_SYMBOL(__init_rwsem); + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here, then: + *   - the 'active count' _reached_ zero + *   - the 'waiting count' is non-zero + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if wakewrite is non-zero + */ +static inline struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) +{ +	struct rwsem_waiter *waiter; +	struct task_struct *tsk; +	int woken; + +	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + +	if (waiter->type == RWSEM_WAITING_FOR_WRITE) { +		if (wakewrite) +			/* Wake up a writer. Note that we do not grant it the +			 * lock - it will have to acquire it when it runs. 
*/ +			wake_up_process(waiter->task); +		goto out; +	} + +	/* grant an infinite number of read locks to the front of the queue */ +	woken = 0; +	do { +		struct list_head *next = waiter->list.next; + +		list_del(&waiter->list); +		tsk = waiter->task; +		smp_mb(); +		waiter->task = NULL; +		wake_up_process(tsk); +		put_task_struct(tsk); +		woken++; +		if (next == &sem->wait_list) +			break; +		waiter = list_entry(next, struct rwsem_waiter, list); +	} while (waiter->type != RWSEM_WAITING_FOR_WRITE); + +	sem->activity += woken; + + out: +	return sem; +} + +/* + * wake a single writer + */ +static inline struct rw_semaphore * +__rwsem_wake_one_writer(struct rw_semaphore *sem) +{ +	struct rwsem_waiter *waiter; + +	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); +	wake_up_process(waiter->task); + +	return sem; +} + +/* + * get a read lock on the semaphore + */ +void __sched __down_read(struct rw_semaphore *sem) +{ +	struct rwsem_waiter waiter; +	struct task_struct *tsk; +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	if (sem->activity >= 0 && list_empty(&sem->wait_list)) { +		/* granted */ +		sem->activity++; +		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +		goto out; +	} + +	tsk = current; +	set_task_state(tsk, TASK_UNINTERRUPTIBLE); + +	/* set up my own style of waitqueue */ +	waiter.task = tsk; +	waiter.type = RWSEM_WAITING_FOR_READ; +	get_task_struct(tsk); + +	list_add_tail(&waiter.list, &sem->wait_list); + +	/* we don't need to touch the semaphore struct anymore */ +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + +	/* wait to be given the lock */ +	for (;;) { +		if (!waiter.task) +			break; +		schedule(); +		set_task_state(tsk, TASK_UNINTERRUPTIBLE); +	} + +	tsk->state = TASK_RUNNING; + out: +	; +} + +/* + * trylock for reading -- returns 1 if successful, 0 if contention + */ +int __down_read_trylock(struct rw_semaphore *sem) +{ +	unsigned long flags; +	int ret = 0; + + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	if (sem->activity >= 0 && list_empty(&sem->wait_list)) { +		/* granted */ +		sem->activity++; +		ret = 1; +	} + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + +	return ret; +} + +/* + * get a write lock on the semaphore + */ +void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) +{ +	struct rwsem_waiter waiter; +	struct task_struct *tsk; +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	/* set up my own style of waitqueue */ +	tsk = current; +	waiter.task = tsk; +	waiter.type = RWSEM_WAITING_FOR_WRITE; +	list_add_tail(&waiter.list, &sem->wait_list); + +	/* wait for someone to release the lock */ +	for (;;) { +		/* +		 * That is the key to support write lock stealing: allows the +		 * task already on CPU to get the lock soon rather than put +		 * itself into sleep and waiting for system woke it or someone +		 * else in the head of the wait list up. 
+		 */ +		if (sem->activity == 0) +			break; +		set_task_state(tsk, TASK_UNINTERRUPTIBLE); +		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +		schedule(); +		raw_spin_lock_irqsave(&sem->wait_lock, flags); +	} +	/* got the lock */ +	sem->activity = -1; +	list_del(&waiter.list); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +void __sched __down_write(struct rw_semaphore *sem) +{ +	__down_write_nested(sem, 0); +} + +/* + * trylock for writing -- returns 1 if successful, 0 if contention + */ +int __down_write_trylock(struct rw_semaphore *sem) +{ +	unsigned long flags; +	int ret = 0; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	if (sem->activity == 0) { +		/* got the lock */ +		sem->activity = -1; +		ret = 1; +	} + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + +	return ret; +} + +/* + * release a read lock on the semaphore + */ +void __up_read(struct rw_semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	if (--sem->activity == 0 && !list_empty(&sem->wait_list)) +		sem = __rwsem_wake_one_writer(sem); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * release a write lock on the semaphore + */ +void __up_write(struct rw_semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	sem->activity = 0; +	if (!list_empty(&sem->wait_list)) +		sem = __rwsem_do_wake(sem, 1); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + +/* + * downgrade a write lock into a read lock + * - just wake up any readers at the front of the queue + */ +void __downgrade_write(struct rw_semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	sem->activity = 1; +	if (!list_empty(&sem->wait_list)) +		sem = __rwsem_do_wake(sem, 0); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); +} + diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c new file mode 100644 index 000000000000..19c5fa95e0b4 --- /dev/null +++ b/kernel/locking/rwsem-xadd.c @@ -0,0 +1,293 @@ +/* rwsem.c: R/W semaphores: contention handling functions + * + * Written by David Howells (dhowells@redhat.com). 
+ * Derived from arch/i386/kernel/semaphore.c + * + * Writer lock-stealing by Alex Shi <alex.shi@intel.com> + * and Michel Lespinasse <walken@google.com> + */ +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/export.h> + +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, +		  struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +	/* +	 * Make sure we are not reinitializing a held semaphore: +	 */ +	debug_check_no_locks_freed((void *)sem, sizeof(*sem)); +	lockdep_init_map(&sem->dep_map, name, key, 0); +#endif +	sem->count = RWSEM_UNLOCKED_VALUE; +	raw_spin_lock_init(&sem->wait_lock); +	INIT_LIST_HEAD(&sem->wait_list); +} + +EXPORT_SYMBOL(__init_rwsem); + +enum rwsem_waiter_type { +	RWSEM_WAITING_FOR_WRITE, +	RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { +	struct list_head list; +	struct task_struct *task; +	enum rwsem_waiter_type type; +}; + +enum rwsem_wake_type { +	RWSEM_WAKE_ANY,		/* Wake whatever's at head of wait list */ +	RWSEM_WAKE_READERS,	/* Wake readers only */ +	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */ +}; + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here from up_xxxx(), then: + *   - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) + *   - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) + * - there must be someone on the queue + * - the spinlock must be held by the caller + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only woken if downgrading is false + */ +static struct rw_semaphore * +__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +{ +	struct rwsem_waiter *waiter; +	struct task_struct *tsk; +	struct list_head *next; +	long oldcount, woken, loop, adjustment; + +	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); +	if (waiter->type == RWSEM_WAITING_FOR_WRITE) { +		if (wake_type == RWSEM_WAKE_ANY) +			/* Wake writer at the front of the queue, but do not +			 * grant it the lock yet as we want other writers +			 * to be able to steal it.  Readers, on the other hand, +			 * will block as they will notice the queued writer. +			 */ +			wake_up_process(waiter->task); +		goto out; +	} + +	/* Writers might steal the lock before we grant it to the next reader. +	 * We prefer to do the first reader grant before counting readers +	 * so we can bail out early if a writer stole the lock. +	 */ +	adjustment = 0; +	if (wake_type != RWSEM_WAKE_READ_OWNED) { +		adjustment = RWSEM_ACTIVE_READ_BIAS; + try_reader_grant: +		oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; +		if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { +			/* A writer stole the lock. Undo our reader grant. */ +			if (rwsem_atomic_update(-adjustment, sem) & +						RWSEM_ACTIVE_MASK) +				goto out; +			/* Last active locker left. Retry waking readers. */ +			goto try_reader_grant; +		} +	} + +	/* Grant an infinite number of read locks to the readers at the front +	 * of the queue.  Note we increment the 'active part' of the count by +	 * the number of readers before waking any processes up. 
+	 */ +	woken = 0; +	do { +		woken++; + +		if (waiter->list.next == &sem->wait_list) +			break; + +		waiter = list_entry(waiter->list.next, +					struct rwsem_waiter, list); + +	} while (waiter->type != RWSEM_WAITING_FOR_WRITE); + +	adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; +	if (waiter->type != RWSEM_WAITING_FOR_WRITE) +		/* hit end of list above */ +		adjustment -= RWSEM_WAITING_BIAS; + +	if (adjustment) +		rwsem_atomic_add(adjustment, sem); + +	next = sem->wait_list.next; +	loop = woken; +	do { +		waiter = list_entry(next, struct rwsem_waiter, list); +		next = waiter->list.next; +		tsk = waiter->task; +		smp_mb(); +		waiter->task = NULL; +		wake_up_process(tsk); +		put_task_struct(tsk); +	} while (--loop); + +	sem->wait_list.next = next; +	next->prev = &sem->wait_list; + + out: +	return sem; +} + +/* + * wait for the read lock to be granted + */ +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) +{ +	long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; +	struct rwsem_waiter waiter; +	struct task_struct *tsk = current; + +	/* set up my own style of waitqueue */ +	waiter.task = tsk; +	waiter.type = RWSEM_WAITING_FOR_READ; +	get_task_struct(tsk); + +	raw_spin_lock_irq(&sem->wait_lock); +	if (list_empty(&sem->wait_list)) +		adjustment += RWSEM_WAITING_BIAS; +	list_add_tail(&waiter.list, &sem->wait_list); + +	/* we're now waiting on the lock, but no longer actively locking */ +	count = rwsem_atomic_update(adjustment, sem); + +	/* If there are no active locks, wake the front queued process(es). +	 * +	 * If there are no writers and we are first in the queue, +	 * wake our own waiter to join the existing active readers ! +	 */ +	if (count == RWSEM_WAITING_BIAS || +	    (count > RWSEM_WAITING_BIAS && +	     adjustment != -RWSEM_ACTIVE_READ_BIAS)) +		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + +	raw_spin_unlock_irq(&sem->wait_lock); + +	/* wait to be given the lock */ +	while (true) { +		set_task_state(tsk, TASK_UNINTERRUPTIBLE); +		if (!waiter.task) +			break; +		schedule(); +	} + +	tsk->state = TASK_RUNNING; + +	return sem; +} + +/* + * wait until we successfully acquire the write lock + */ +struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) +{ +	long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; +	struct rwsem_waiter waiter; +	struct task_struct *tsk = current; + +	/* set up my own style of waitqueue */ +	waiter.task = tsk; +	waiter.type = RWSEM_WAITING_FOR_WRITE; + +	raw_spin_lock_irq(&sem->wait_lock); +	if (list_empty(&sem->wait_list)) +		adjustment += RWSEM_WAITING_BIAS; +	list_add_tail(&waiter.list, &sem->wait_list); + +	/* we're now waiting on the lock, but no longer actively locking */ +	count = rwsem_atomic_update(adjustment, sem); + +	/* If there were already threads queued before us and there are no +	 * active writers, the lock must be read owned; so we try to wake +	 * any read locks that were queued ahead of us. */ +	if (count > RWSEM_WAITING_BIAS && +	    adjustment == -RWSEM_ACTIVE_WRITE_BIAS) +		sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + +	/* wait until we successfully acquire the lock */ +	set_task_state(tsk, TASK_UNINTERRUPTIBLE); +	while (true) { +		if (!(count & RWSEM_ACTIVE_MASK)) { +			/* Try acquiring the write lock. 
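
/*
 * For reading the acquisition attempt just below: hypothetical mirrors of the
 * generic 32-bit bias values (64-bit architectures widen the active mask).
 * This restates the existing sem->count layout the __rwsem_do_wake() comment
 * refers to; nothing here is introduced by the patch.
 */
#define EX_RWSEM_ACTIVE_BIAS     0x00000001L    /* one active reader or writer */
#define EX_RWSEM_ACTIVE_MASK     0x0000ffffL    /* the 'active part' */
#define EX_RWSEM_WAITING_BIAS   (-0x00010000L)  /* set while wait_list is non-empty */

/*
 * unlocked:               count == 0
 * N readers, no waiters:  count == N
 * lockers plus waiters:   count < 0, (count & EX_RWSEM_ACTIVE_MASK) lockers
 * waiters, lock free:     count == EX_RWSEM_WAITING_BIAS, the exact value the
 *                         cmpxchg() below looks for before taking the lock
 */
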
*/ +			count = RWSEM_ACTIVE_WRITE_BIAS; +			if (!list_is_singular(&sem->wait_list)) +				count += RWSEM_WAITING_BIAS; + +			if (sem->count == RWSEM_WAITING_BIAS && +			    cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == +							RWSEM_WAITING_BIAS) +				break; +		} + +		raw_spin_unlock_irq(&sem->wait_lock); + +		/* Block until there are no active lockers. */ +		do { +			schedule(); +			set_task_state(tsk, TASK_UNINTERRUPTIBLE); +		} while ((count = sem->count) & RWSEM_ACTIVE_MASK); + +		raw_spin_lock_irq(&sem->wait_lock); +	} + +	list_del(&waiter.list); +	raw_spin_unlock_irq(&sem->wait_lock); +	tsk->state = TASK_RUNNING; + +	return sem; +} + +/* + * handle waking up a waiter on the semaphore + * - up_read/up_write has decremented the active part of count if we come here + */ +struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	/* do nothing if list empty */ +	if (!list_empty(&sem->wait_list)) +		sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + +	return sem; +} + +/* + * downgrade a write lock into a read lock + * - caller incremented waiting part of count and discovered it still negative + * - just wake up any readers at the front of the queue + */ +struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&sem->wait_lock, flags); + +	/* do nothing if list empty */ +	if (!list_empty(&sem->wait_list)) +		sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + +	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + +	return sem; +} + +EXPORT_SYMBOL(rwsem_down_read_failed); +EXPORT_SYMBOL(rwsem_down_write_failed); +EXPORT_SYMBOL(rwsem_wake); +EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/rwsem.c b/kernel/locking/rwsem.c index cfff1435bdfb..cfff1435bdfb 100644 --- a/kernel/rwsem.c +++ b/kernel/locking/rwsem.c diff --git a/kernel/semaphore.c b/kernel/locking/semaphore.c index 6815171a4fff..6815171a4fff 100644 --- a/kernel/semaphore.c +++ b/kernel/locking/semaphore.c diff --git a/kernel/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..4b082b5cac9e 100644 --- a/kernel/spinlock.c +++ b/kernel/locking/spinlock.c diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c new file mode 100644 index 000000000000..0374a596cffa --- /dev/null +++ b/kernel/locking/spinlock_debug.c @@ -0,0 +1,302 @@ +/* + * Copyright 2005, Red Hat, Inc., Ingo Molnar + * Released under the General Public License (GPL). + * + * This file contains the spinlock/rwlock implementations for + * DEBUG_SPINLOCK. 
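
/*
 * For context on the slow-path entry points the new rwsem-xadd.c exports
 * above: the arch/asm-generic fast paths only drop into them on contention.
 * A rough paraphrase of the asm-generic wrappers (from memory, not part of
 * this patch):
 */
#include <linux/atomic.h>
#include <linux/rwsem.h>

static inline void sketch_down_read(struct rw_semaphore *sem)
{
        /* fast path: one atomic increment; a non-positive result means contention */
        if (unlikely(atomic_long_inc_return((atomic_long_t *)&sem->count) <= 0))
                rwsem_down_read_failed(sem);
}

static inline void sketch_up_write(struct rw_semaphore *sem)
{
        /* dropping the write bias below zero means waiters are queued */
        if (unlikely(atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
                                            (atomic_long_t *)&sem->count) < 0))
                rwsem_wake(sem);
}
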
+ */ + +#include <linux/spinlock.h> +#include <linux/nmi.h> +#include <linux/interrupt.h> +#include <linux/debug_locks.h> +#include <linux/delay.h> +#include <linux/export.h> + +void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, +			  struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +	/* +	 * Make sure we are not reinitializing a held lock: +	 */ +	debug_check_no_locks_freed((void *)lock, sizeof(*lock)); +	lockdep_init_map(&lock->dep_map, name, key, 0); +#endif +	lock->raw_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +	lock->magic = SPINLOCK_MAGIC; +	lock->owner = SPINLOCK_OWNER_INIT; +	lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__raw_spin_lock_init); + +void __rwlock_init(rwlock_t *lock, const char *name, +		   struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +	/* +	 * Make sure we are not reinitializing a held lock: +	 */ +	debug_check_no_locks_freed((void *)lock, sizeof(*lock)); +	lockdep_init_map(&lock->dep_map, name, key, 0); +#endif +	lock->raw_lock = (arch_rwlock_t) __ARCH_RW_LOCK_UNLOCKED; +	lock->magic = RWLOCK_MAGIC; +	lock->owner = SPINLOCK_OWNER_INIT; +	lock->owner_cpu = -1; +} + +EXPORT_SYMBOL(__rwlock_init); + +static void spin_dump(raw_spinlock_t *lock, const char *msg) +{ +	struct task_struct *owner = NULL; + +	if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) +		owner = lock->owner; +	printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n", +		msg, raw_smp_processor_id(), +		current->comm, task_pid_nr(current)); +	printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, " +			".owner_cpu: %d\n", +		lock, lock->magic, +		owner ? owner->comm : "<none>", +		owner ? task_pid_nr(owner) : -1, +		lock->owner_cpu); +	dump_stack(); +} + +static void spin_bug(raw_spinlock_t *lock, const char *msg) +{ +	if (!debug_locks_off()) +		return; + +	spin_dump(lock, msg); +} + +#define SPIN_BUG_ON(cond, lock, msg) if (unlikely(cond)) spin_bug(lock, msg) + +static inline void +debug_spin_lock_before(raw_spinlock_t *lock) +{ +	SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); +	SPIN_BUG_ON(lock->owner == current, lock, "recursion"); +	SPIN_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), +							lock, "cpu recursion"); +} + +static inline void debug_spin_lock_after(raw_spinlock_t *lock) +{ +	lock->owner_cpu = raw_smp_processor_id(); +	lock->owner = current; +} + +static inline void debug_spin_unlock(raw_spinlock_t *lock) +{ +	SPIN_BUG_ON(lock->magic != SPINLOCK_MAGIC, lock, "bad magic"); +	SPIN_BUG_ON(!raw_spin_is_locked(lock), lock, "already unlocked"); +	SPIN_BUG_ON(lock->owner != current, lock, "wrong owner"); +	SPIN_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), +							lock, "wrong CPU"); +	lock->owner = SPINLOCK_OWNER_INIT; +	lock->owner_cpu = -1; +} + +static void __spin_lock_debug(raw_spinlock_t *lock) +{ +	u64 i; +	u64 loops = loops_per_jiffy * HZ; + +	for (i = 0; i < loops; i++) { +		if (arch_spin_trylock(&lock->raw_lock)) +			return; +		__delay(1); +	} +	/* lockup suspected: */ +	spin_dump(lock, "lockup suspected"); +#ifdef CONFIG_SMP +	trigger_all_cpu_backtrace(); +#endif + +	/* +	 * The trylock above was causing a livelock.  Give the lower level arch +	 * specific lock code a chance to acquire the lock. We have already +	 * printed a warning/backtrace at this point. The non-debug arch +	 * specific code might actually succeed in acquiring the lock.  If it is +	 * not successful, the end-result is the same - there is no forward +	 * progress. 
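
/*
 * Where the do_raw_spin_lock()/do_raw_spin_trylock() defined just below are
 * reached from: with CONFIG_DEBUG_SPINLOCK or lockdep enabled, raw_spin_lock()
 * is routed through these out-of-line helpers rather than the raw arch op.
 * Roughly, paraphrasing include/linux/spinlock_api_smp.h from memory (not
 * part of this patch):
 */
#include <linux/spinlock.h>

static inline void sketch_raw_spin_lock(raw_spinlock_t *lock)
{
        preempt_disable();
        spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);   /* lockdep annotation */
        LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
}
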
+	 */ +	arch_spin_lock(&lock->raw_lock); +} + +void do_raw_spin_lock(raw_spinlock_t *lock) +{ +	debug_spin_lock_before(lock); +	if (unlikely(!arch_spin_trylock(&lock->raw_lock))) +		__spin_lock_debug(lock); +	debug_spin_lock_after(lock); +} + +int do_raw_spin_trylock(raw_spinlock_t *lock) +{ +	int ret = arch_spin_trylock(&lock->raw_lock); + +	if (ret) +		debug_spin_lock_after(lock); +#ifndef CONFIG_SMP +	/* +	 * Must not happen on UP: +	 */ +	SPIN_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif +	return ret; +} + +void do_raw_spin_unlock(raw_spinlock_t *lock) +{ +	debug_spin_unlock(lock); +	arch_spin_unlock(&lock->raw_lock); +} + +static void rwlock_bug(rwlock_t *lock, const char *msg) +{ +	if (!debug_locks_off()) +		return; + +	printk(KERN_EMERG "BUG: rwlock %s on CPU#%d, %s/%d, %p\n", +		msg, raw_smp_processor_id(), current->comm, +		task_pid_nr(current), lock); +	dump_stack(); +} + +#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg) + +#if 0		/* __write_lock_debug() can lock up - maybe this can too? */ +static void __read_lock_debug(rwlock_t *lock) +{ +	u64 i; +	u64 loops = loops_per_jiffy * HZ; +	int print_once = 1; + +	for (;;) { +		for (i = 0; i < loops; i++) { +			if (arch_read_trylock(&lock->raw_lock)) +				return; +			__delay(1); +		} +		/* lockup suspected: */ +		if (print_once) { +			print_once = 0; +			printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, " +					"%s/%d, %p\n", +				raw_smp_processor_id(), current->comm, +				current->pid, lock); +			dump_stack(); +		} +	} +} +#endif + +void do_raw_read_lock(rwlock_t *lock) +{ +	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); +	arch_read_lock(&lock->raw_lock); +} + +int do_raw_read_trylock(rwlock_t *lock) +{ +	int ret = arch_read_trylock(&lock->raw_lock); + +#ifndef CONFIG_SMP +	/* +	 * Must not happen on UP: +	 */ +	RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif +	return ret; +} + +void do_raw_read_unlock(rwlock_t *lock) +{ +	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); +	arch_read_unlock(&lock->raw_lock); +} + +static inline void debug_write_lock_before(rwlock_t *lock) +{ +	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); +	RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); +	RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), +							lock, "cpu recursion"); +} + +static inline void debug_write_lock_after(rwlock_t *lock) +{ +	lock->owner_cpu = raw_smp_processor_id(); +	lock->owner = current; +} + +static inline void debug_write_unlock(rwlock_t *lock) +{ +	RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); +	RWLOCK_BUG_ON(lock->owner != current, lock, "wrong owner"); +	RWLOCK_BUG_ON(lock->owner_cpu != raw_smp_processor_id(), +							lock, "wrong CPU"); +	lock->owner = SPINLOCK_OWNER_INIT; +	lock->owner_cpu = -1; +} + +#if 0		/* This can cause lockups */ +static void __write_lock_debug(rwlock_t *lock) +{ +	u64 i; +	u64 loops = loops_per_jiffy * HZ; +	int print_once = 1; + +	for (;;) { +		for (i = 0; i < loops; i++) { +			if (arch_write_trylock(&lock->raw_lock)) +				return; +			__delay(1); +		} +		/* lockup suspected: */ +		if (print_once) { +			print_once = 0; +			printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, " +					"%s/%d, %p\n", +				raw_smp_processor_id(), current->comm, +				current->pid, lock); +			dump_stack(); +		} +	} +} +#endif + +void do_raw_write_lock(rwlock_t *lock) +{ +	debug_write_lock_before(lock); +	arch_write_lock(&lock->raw_lock); +	debug_write_lock_after(lock); +} + +int 
do_raw_write_trylock(rwlock_t *lock) +{ +	int ret = arch_write_trylock(&lock->raw_lock); + +	if (ret) +		debug_write_lock_after(lock); +#ifndef CONFIG_SMP +	/* +	 * Must not happen on UP: +	 */ +	RWLOCK_BUG_ON(!ret, lock, "trylock failure on UP"); +#endif +	return ret; +} + +void do_raw_write_unlock(rwlock_t *lock) +{ +	debug_write_unlock(lock); +	arch_write_unlock(&lock->raw_lock); +} diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S deleted file mode 100644 index 4a9a86d12c8b..000000000000 --- a/kernel/modsign_certificate.S +++ /dev/null @@ -1,12 +0,0 @@ -#include <linux/export.h> - -#define GLOBAL(name)	\ -	.globl VMLINUX_SYMBOL(name);	\ -	VMLINUX_SYMBOL(name): - -	.section ".init.data","aw" - -GLOBAL(modsign_certificate_list) -	.incbin "signing_key.x509" -	.incbin "extra_certificates" -GLOBAL(modsign_certificate_list_end) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c deleted file mode 100644 index 7cbd4507a7e6..000000000000 --- a/kernel/modsign_pubkey.c +++ /dev/null @@ -1,104 +0,0 @@ -/* Public keys for module signature verification - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/cred.h> -#include <linux/err.h> -#include <keys/asymmetric-type.h> -#include "module-internal.h" - -struct key *modsign_keyring; - -extern __initconst const u8 modsign_certificate_list[]; -extern __initconst const u8 modsign_certificate_list_end[]; - -/* - * We need to make sure ccache doesn't cache the .o file as it doesn't notice - * if modsign.pub changes. - */ -static __initconst const char annoy_ccache[] = __TIME__ "foo"; - -/* - * Load the compiled-in keys - */ -static __init int module_verify_init(void) -{ -	pr_notice("Initialise module verification\n"); - -	modsign_keyring = keyring_alloc(".module_sign", -					KUIDT_INIT(0), KGIDT_INIT(0), -					current_cred(), -					((KEY_POS_ALL & ~KEY_POS_SETATTR) | -					 KEY_USR_VIEW | KEY_USR_READ), -					KEY_ALLOC_NOT_IN_QUOTA, NULL); -	if (IS_ERR(modsign_keyring)) -		panic("Can't allocate module signing keyring\n"); - -	return 0; -} - -/* - * Must be initialised before we try and load the keys into the keyring. - */ -device_initcall(module_verify_init); - -/* - * Load the compiled-in keys - */ -static __init int load_module_signing_keys(void) -{ -	key_ref_t key; -	const u8 *p, *end; -	size_t plen; - -	pr_notice("Loading module verification certificates\n"); - -	end = modsign_certificate_list_end; -	p = modsign_certificate_list; -	while (p < end) { -		/* Each cert begins with an ASN.1 SEQUENCE tag and must be more -		 * than 256 bytes in size. 
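
/*
 * A worked example of the DER header arithmetic in the (removed) parsing loop
 * below, which the new system keyring code takes over: a certificate starting
 * with the bytes 30 82 03 1a is an ASN.1 SEQUENCE (0x30) carrying a two-byte
 * length (0x82), so
 *
 *      plen = (0x03 << 8) | 0x1a;      /* 0x031a == 794 payload bytes */
 *      plen += 4;                      /* plus the 4 header bytes => 798 */
 *
 * and p advances by 798 to the next concatenated certificate.  The example
 * bytes are made up; the arithmetic mirrors the code as written.
 */
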
-		 */ -		if (end - p < 4) -			goto dodgy_cert; -		if (p[0] != 0x30 && -		    p[1] != 0x82) -			goto dodgy_cert; -		plen = (p[2] << 8) | p[3]; -		plen += 4; -		if (plen > end - p) -			goto dodgy_cert; - -		key = key_create_or_update(make_key_ref(modsign_keyring, 1), -					   "asymmetric", -					   NULL, -					   p, -					   plen, -					   (KEY_POS_ALL & ~KEY_POS_SETATTR) | -					   KEY_USR_VIEW, -					   KEY_ALLOC_NOT_IN_QUOTA); -		if (IS_ERR(key)) -			pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", -			       PTR_ERR(key)); -		else -			pr_notice("MODSIGN: Loaded cert '%s'\n", -				  key_ref_to_ptr(key)->description); -		p += plen; -	} - -	return 0; - -dodgy_cert: -	pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); -	return 0; -} -late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 24f9247b7d02..915e123a430f 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -9,6 +9,4 @@   * 2 of the Licence, or (at your option) any later version.   */ -extern struct key *modsign_keyring; -  extern int mod_verify_sig(const void *mod, unsigned long *_modlen); diff --git a/kernel/module.c b/kernel/module.c index dc582749fa13..f5a3b1e8ec51 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -378,23 +378,21 @@ static bool check_symbol(const struct symsearch *syms,  		if (syms->licence == GPL_ONLY)  			return false;  		if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { -			printk(KERN_WARNING "Symbol %s is being used " -			       "by a non-GPL module, which will not " -			       "be allowed in the future\n", fsa->name); +			pr_warn("Symbol %s is being used by a non-GPL module, " +				"which will not be allowed in the future\n", +				fsa->name);  		}  	}  #ifdef CONFIG_UNUSED_SYMBOLS  	if (syms->unused && fsa->warn) { -		printk(KERN_WARNING "Symbol %s is marked as UNUSED, " -		       "however this module is using it.\n", fsa->name); -		printk(KERN_WARNING -		       "This symbol will go away in the future.\n"); -		printk(KERN_WARNING -		       "Please evalute if this is the right api to use and if " -		       "it really is, submit a report the linux kernel " -		       "mailinglist together with submitting your code for " -		       "inclusion.\n"); +		pr_warn("Symbol %s is marked as UNUSED, however this module is " +			"using it.\n", fsa->name); +		pr_warn("This symbol will go away in the future.\n"); +		pr_warn("Please evalute if this is the right api to use and if " +			"it really is, submit a report the linux kernel " +			"mailinglist together with submitting your code for " +			"inclusion.\n");  	}  #endif @@ -492,16 +490,15 @@ static int percpu_modalloc(struct module *mod, struct load_info *info)  		return 0;  	if (align > PAGE_SIZE) { -		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", -		       mod->name, align, PAGE_SIZE); +		pr_warn("%s: per-cpu alignment %li > %li\n", +			mod->name, align, PAGE_SIZE);  		align = PAGE_SIZE;  	}  	mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);  	if (!mod->percpu) { -		printk(KERN_WARNING -		       "%s: Could not allocate %lu bytes percpu data\n", -		       mod->name, (unsigned long)pcpusec->sh_size); +		pr_warn("%s: Could not allocate %lu bytes percpu data\n", +			mod->name, (unsigned long)pcpusec->sh_size);  		return -ENOMEM;  	}  	mod->percpu_size = pcpusec->sh_size; @@ -644,8 +641,6 @@ static int module_unload_init(struct module *mod)  	/* Hold reference count during initialization. 
*/  	__this_cpu_write(mod->refptr->incs, 1); -	/* Backwards compatibility macros put refcount during init. */ -	mod->waiter = current;  	return 0;  } @@ -679,7 +674,7 @@ static int add_module_usage(struct module *a, struct module *b)  	pr_debug("Allocating new usage for %s.\n", a->name);  	use = kmalloc(sizeof(*use), GFP_ATOMIC);  	if (!use) { -		printk(KERN_WARNING "%s: out of memory loading\n", a->name); +		pr_warn("%s: out of memory loading\n", a->name);  		return -ENOMEM;  	} @@ -771,16 +766,9 @@ static int __try_stop_module(void *_sref)  static int try_stop_module(struct module *mod, int flags, int *forced)  { -	if (flags & O_NONBLOCK) { -		struct stopref sref = { mod, flags, forced }; +	struct stopref sref = { mod, flags, forced }; -		return stop_machine(__try_stop_module, &sref, NULL); -	} else { -		/* We don't need to stop the machine for this. */ -		mod->state = MODULE_STATE_GOING; -		synchronize_sched(); -		return 0; -	} +	return stop_machine(__try_stop_module, &sref, NULL);  }  unsigned long module_refcount(struct module *mod) @@ -813,21 +801,6 @@ EXPORT_SYMBOL(module_refcount);  /* This exists whether we can unload or not */  static void free_module(struct module *mod); -static void wait_for_zero_refcount(struct module *mod) -{ -	/* Since we might sleep for some time, release the mutex first */ -	mutex_unlock(&module_mutex); -	for (;;) { -		pr_debug("Looking at refcount...\n"); -		set_current_state(TASK_UNINTERRUPTIBLE); -		if (module_refcount(mod) == 0) -			break; -		schedule(); -	} -	current->state = TASK_RUNNING; -	mutex_lock(&module_mutex); -} -  SYSCALL_DEFINE2(delete_module, const char __user *, name_user,  		unsigned int, flags)  { @@ -842,6 +815,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,  		return -EFAULT;  	name[MODULE_NAME_LEN-1] = '\0'; +	if (!(flags & O_NONBLOCK)) { +		printk(KERN_WARNING +		       "waiting module removal not supported: please upgrade"); +	} +  	if (mutex_lock_interruptible(&module_mutex) != 0)  		return -EINTR; @@ -859,8 +837,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,  	/* Doing init or already dying? */  	if (mod->state != MODULE_STATE_LIVE) { -		/* FIXME: if (force), slam module count and wake up -                   waiter --RR */ +		/* FIXME: if (force), slam module count damn the torpedoes */  		pr_debug("%s already dying\n", mod->name);  		ret = -EBUSY;  		goto out; @@ -876,18 +853,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,  		}  	} -	/* Set this up before setting mod->state */ -	mod->waiter = current; -  	/* Stop the machine so refcounts can't move and disable module. */  	ret = try_stop_module(mod, flags, &forced);  	if (ret != 0)  		goto out; -	/* Never wait if forced. */ -	if (!forced && module_refcount(mod) != 0) -		wait_for_zero_refcount(mod); -  	mutex_unlock(&module_mutex);  	/* Final destruction now no one is using it. */  	if (mod->exit != NULL) @@ -1005,9 +975,6 @@ void module_put(struct module *module)  		__this_cpu_inc(module->refptr->decs);  		trace_module_put(module, _RET_IP_); -		/* Maybe they're waiting for us to drop reference? 
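
/*
 * The userspace-visible effect of the removals above: delete_module(2) no
 * longer blocks waiting for the refcount to reach zero, so callers should
 * pass O_NONBLOCK (typical rmmod implementations already do); omitting it now
 * only triggers the "waiting module removal not supported" warning.  A
 * minimal userspace sketch, with an illustrative wrapper name:
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

static long example_delete_module(const char *name)
{
        return syscall(SYS_delete_module, name, O_NONBLOCK);
}
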
*/ -		if (unlikely(!module_is_live(module))) -			wake_up_process(module->waiter);  		preempt_enable();  	}  } @@ -1145,8 +1112,7 @@ static int try_to_force_load(struct module *mod, const char *reason)  {  #ifdef CONFIG_MODULE_FORCE_LOAD  	if (!test_taint(TAINT_FORCED_MODULE)) -		printk(KERN_WARNING "%s: %s: kernel tainted.\n", -		       mod->name, reason); +		pr_warn("%s: %s: kernel tainted.\n", mod->name, reason);  	add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_NOW_UNRELIABLE);  	return 0;  #else @@ -1199,8 +1165,7 @@ static int check_version(Elf_Shdr *sechdrs,  		goto bad_version;  	} -	printk(KERN_WARNING "%s: no symbol version for %s\n", -	       mod->name, symname); +	pr_warn("%s: no symbol version for %s\n", mod->name, symname);  	return 0;  bad_version: @@ -1309,8 +1274,8 @@ resolve_symbol_wait(struct module *mod,  			!IS_ERR(ksym = resolve_symbol(mod, info, name, owner))  			|| PTR_ERR(ksym) != -EBUSY,  					     30 * HZ) <= 0) { -		printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", -		       mod->name, owner); +		pr_warn("%s: gave up waiting for init of module %s.\n", +			mod->name, owner);  	}  	return ksym;  } @@ -1626,15 +1591,14 @@ static int mod_sysfs_init(struct module *mod)  	struct kobject *kobj;  	if (!module_sysfs_initialized) { -		printk(KERN_ERR "%s: module sysfs not initialized\n", -		       mod->name); +		pr_err("%s: module sysfs not initialized\n", mod->name);  		err = -EINVAL;  		goto out;  	}  	kobj = kset_find_obj(module_kset, mod->name);  	if (kobj) { -		printk(KERN_ERR "%s: module is already loaded\n", mod->name); +		pr_err("%s: module is already loaded\n", mod->name);  		kobject_put(kobj);  		err = -EINVAL;  		goto out; @@ -1961,8 +1925,7 @@ static int verify_export_symbols(struct module *mod)  	for (i = 0; i < ARRAY_SIZE(arr); i++) {  		for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {  			if (find_symbol(s->name, &owner, NULL, true, false)) { -				printk(KERN_ERR -				       "%s: exports duplicate symbol %s" +				pr_err("%s: exports duplicate symbol %s"  				       " (owned by %s)\n",  				       mod->name, s->name, module_name(owner));  				return -ENOEXEC; @@ -2013,8 +1976,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)  			if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)  				break; -			printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", -			       mod->name, name, PTR_ERR(ksym)); +			pr_warn("%s: Unknown symbol %s (err %li)\n", +				mod->name, name, PTR_ERR(ksym));  			ret = PTR_ERR(ksym) ?: -ENOENT;  			break; @@ -2168,8 +2131,8 @@ static void set_license(struct module *mod, const char *license)  	if (!license_is_gpl_compatible(license)) {  		if (!test_taint(TAINT_PROPRIETARY_MODULE)) -			printk(KERN_WARNING "%s: module license '%s' taints " -				"kernel.\n", mod->name, license); +			pr_warn("%s: module license '%s' taints kernel.\n", +				mod->name, license);  		add_taint_module(mod, TAINT_PROPRIETARY_MODULE,  				 LOCKDEP_NOW_UNRELIABLE);  	} @@ -2405,8 +2368,8 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)  		return;  #ifdef CONFIG_DYNAMIC_DEBUG  	if (ddebug_add_module(debug, num, debug->modname)) -		printk(KERN_ERR "dynamic debug error adding module: %s\n", -					debug->modname); +		pr_err("dynamic debug error adding module: %s\n", +			debug->modname);  #endif  } @@ -2619,8 +2582,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)  		Elf_Shdr *shdr = &info->sechdrs[i];  		if (shdr->sh_type != SHT_NOBITS  		    && info->len < 
shdr->sh_offset + shdr->sh_size) { -			printk(KERN_ERR "Module len %lu truncated\n", -			       info->len); +			pr_err("Module len %lu truncated\n", info->len);  			return -ENOEXEC;  		} @@ -2682,15 +2644,14 @@ static struct module *setup_load_info(struct load_info *info, int flags)  	info->index.mod = find_sec(info, ".gnu.linkonce.this_module");  	if (!info->index.mod) { -		printk(KERN_WARNING "No module found in object\n"); +		pr_warn("No module found in object\n");  		return ERR_PTR(-ENOEXEC);  	}  	/* This is temporary: point mod into copy of data. */  	mod = (void *)info->sechdrs[info->index.mod].sh_addr;  	if (info->index.sym == 0) { -		printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", -		       mod->name); +		pr_warn("%s: module has no symbols (stripped?)\n", mod->name);  		return ERR_PTR(-ENOEXEC);  	} @@ -2717,7 +2678,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)  		if (err)  			return err;  	} else if (!same_magic(modmagic, vermagic, info->index.vers)) { -		printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", +		pr_err("%s: version magic '%s' should be '%s'\n",  		       mod->name, modmagic, vermagic);  		return -ENOEXEC;  	} @@ -2727,9 +2688,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)  	if (get_modinfo(info, "staging")) {  		add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); -		printk(KERN_WARNING "%s: module is from the staging directory," -		       " the quality is unknown, you have been warned.\n", -		       mod->name); +		pr_warn("%s: module is from the staging directory, the quality " +			"is unknown, you have been warned.\n", mod->name);  	}  	/* Set up license info based on the info section */ @@ -2738,7 +2698,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)  	return 0;  } -static void find_module_sections(struct module *mod, struct load_info *info) +static int find_module_sections(struct module *mod, struct load_info *info)  {  	mod->kp = section_objs(info, "__param",  			       sizeof(*mod->kp), &mod->num_kp); @@ -2768,6 +2728,18 @@ static void find_module_sections(struct module *mod, struct load_info *info)  #ifdef CONFIG_CONSTRUCTORS  	mod->ctors = section_objs(info, ".ctors",  				  sizeof(*mod->ctors), &mod->num_ctors); +	if (!mod->ctors) +		mod->ctors = section_objs(info, ".init_array", +				sizeof(*mod->ctors), &mod->num_ctors); +	else if (find_sec(info, ".init_array")) { +		/* +		 * This shouldn't happen with same compiler and binutils +		 * building all parts of the module. 
+		 */ +		printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", +		       mod->name); +		return -EINVAL; +	}  #endif  #ifdef CONFIG_TRACEPOINTS @@ -2801,11 +2773,12 @@ static void find_module_sections(struct module *mod, struct load_info *info)  				    sizeof(*mod->extable), &mod->num_exentries);  	if (section_addr(info, "__obsparm")) -		printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", -		       mod->name); +		pr_warn("%s: Ignoring obsolete parameters\n", mod->name);  	info->debug = section_objs(info, "__verbose",  				   sizeof(*info->debug), &info->num_debug); + +	return 0;  }  static int move_module(struct module *mod, struct load_info *info) @@ -3078,11 +3051,10 @@ static int do_init_module(struct module *mod)  		return ret;  	}  	if (ret > 0) { -		printk(KERN_WARNING -"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" -"%s: loading module anyway...\n", -		       __func__, mod->name, ret, -		       __func__); +		pr_warn("%s: '%s'->init suspiciously returned %d, it should " +			"follow 0/-E convention\n" +			"%s: loading module anyway...\n", +			__func__, mod->name, ret, __func__);  		dump_stack();  	} @@ -3205,10 +3177,8 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname)  {  	/* Check for magic 'dyndbg' arg */   	int ret = ddebug_dyndbg_module_param_cb(param, val, modname); -	if (ret != 0) { -		printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", -		       modname, param); -	} +	if (ret != 0) +		pr_warn("%s: unknown parameter '%s' ignored\n", modname, param);  	return 0;  } @@ -3243,10 +3213,9 @@ static int load_module(struct load_info *info, const char __user *uargs,  #ifdef CONFIG_MODULE_SIG  	mod->sig_ok = info->sig_ok;  	if (!mod->sig_ok) { -		printk_once(KERN_NOTICE -			    "%s: module verification failed: signature and/or" -			    " required key missing - tainting kernel\n", -			    mod->name); +		pr_notice_once("%s: module verification failed: signature " +			       "and/or  required key missing - tainting " +			       "kernel\n", mod->name);  		add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK);  	}  #endif @@ -3263,7 +3232,9 @@ static int load_module(struct load_info *info, const char __user *uargs,  	/* Now we've got everything in the final locations, we can  	 * find optional sections. */ -	find_module_sections(mod, info); +	err = find_module_sections(mod, info); +	if (err) +		goto free_unload;  	err = check_module_license_and_versions(mod);  	if (err) diff --git a/kernel/module_signing.c b/kernel/module_signing.c index f2970bddc5ea..be5b8fac4bd0 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -14,6 +14,7 @@  #include <crypto/public_key.h>  #include <crypto/hash.h>  #include <keys/asymmetric-type.h> +#include <keys/system_keyring.h>  #include "module-internal.h"  /* @@ -28,7 +29,7 @@   */  struct module_signature {  	u8	algo;		/* Public-key crypto algorithm [enum pkey_algo] */ -	u8	hash;		/* Digest algorithm [enum pkey_hash_algo] */ +	u8	hash;		/* Digest algorithm [enum hash_algo] */  	u8	id_type;	/* Key identifier type [enum pkey_id_type] */  	u8	signer_len;	/* Length of signer's name */  	u8	key_id_len;	/* Length of key identifier */ @@ -39,7 +40,7 @@ struct module_signature {  /*   * Digest the module contents.   
*/ -static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, +static struct public_key_signature *mod_make_digest(enum hash_algo hash,  						    const void *mod,  						    unsigned long modlen)  { @@ -54,7 +55,7 @@ static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash,  	/* Allocate the hashing algorithm we're going to need and find out how  	 * big the hash operational data will be.  	 */ -	tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); +	tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);  	if (IS_ERR(tfm))  		return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); @@ -157,7 +158,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len,  	pr_debug("Look up: \"%s\"\n", id); -	key = keyring_search(make_key_ref(modsign_keyring, 1), +	key = keyring_search(make_key_ref(system_trusted_keyring, 1),  			     &key_type_asymmetric, id);  	if (IS_ERR(key))  		pr_warn("Request for unknown module key '%s' err %ld\n", @@ -217,7 +218,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)  		return -ENOPKG;  	if (ms.hash >= PKEY_HASH__LAST || -	    !pkey_hash_algo[ms.hash]) +	    !hash_algo_name[ms.hash])  		return -ENOPKG;  	key = request_asymmetric_key(sig, ms.signer_len, diff --git a/kernel/padata.c b/kernel/padata.c index 07af2c95dcfe..2abd25d79cc8 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)  static int padata_cpu_hash(struct parallel_data *pd)  { +	unsigned int seq_nr;  	int cpu_index;  	/* @@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd)  	 * seq_nr mod. number of cpus in use.  	 */ -	spin_lock(&pd->seq_lock); -	cpu_index =  pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); -	pd->seq_nr++; -	spin_unlock(&pd->seq_lock); +	seq_nr = atomic_inc_return(&pd->seq_nr); +	cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);  	return padata_index_to_cpu(pd, cpu_index);  } @@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,  	padata_init_pqueues(pd);  	padata_init_squeues(pd);  	setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); -	pd->seq_nr = 0; +	atomic_set(&pd->seq_nr, -1);  	atomic_set(&pd->reorder_objects, 0);  	atomic_set(&pd->refcnt, 0);  	pd->pinst = pinst; diff --git a/kernel/panic.c b/kernel/panic.c index b6c482ccc5db..c00b4ceb39e8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -233,7 +233,7 @@ static const struct tnt tnts[] = {   */  const char *print_tainted(void)  { -	static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; +	static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ")];  	if (tainted_mask) {  		char *s; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 42086551a24a..06c62de9c711 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -132,6 +132,12 @@ out:  	return ERR_PTR(err);  } +static void delayed_free_pidns(struct rcu_head *p) +{ +	kmem_cache_free(pid_ns_cachep, +			container_of(p, struct pid_namespace, rcu)); +} +  static void destroy_pid_namespace(struct pid_namespace *ns)  {  	int i; @@ -140,7 +146,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)  	for (i = 0; i < PIDMAP_ENTRIES; i++)  		kfree(ns->pidmap[i].page);  	put_user_ns(ns->user_ns); -	kmem_cache_free(pid_ns_cachep, ns); +	call_rcu(&ns->rcu, delayed_free_pidns);  }  struct pid_namespace *copy_pid_ns(unsigned long flags, diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 
d444c4e834f4..2fac9cc79b3d 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -178,6 +178,22 @@ config PM_SLEEP_DEBUG  	def_bool y  	depends on PM_DEBUG && PM_SLEEP +config DPM_WATCHDOG +	bool "Device suspend/resume watchdog" +	depends on PM_DEBUG && PSTORE +	---help--- +	  Sets up a watchdog timer to capture drivers that are +	  locked up attempting to suspend/resume a device. +	  A detected lockup causes system panic with message +	  captured in pstore device for inspection in subsequent +	  boot session. + +config DPM_WATCHDOG_TIMEOUT +	int "Watchdog timeout in seconds" +	range 1 120 +	default 12 +	depends on DPM_WATCHDOG +  config PM_TRACE  	bool  	help diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c9c759d5a15c..0121dab83f43 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -846,7 +846,7 @@ static int software_resume(void)  	goto Finish;  } -late_initcall(software_resume); +late_initcall_sync(software_resume);  static const char * const hibernation_modes[] = { diff --git a/kernel/power/qos.c b/kernel/power/qos.c index a394297f8b2f..8dff9b48075a 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -558,30 +558,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,  	if (count == sizeof(s32)) {  		if (copy_from_user(&value, buf, sizeof(s32)))  			return -EFAULT; -	} else if (count <= 11) { /* ASCII perhaps? */ -		char ascii_value[11]; -		unsigned long int ulval; +	} else {  		int ret; -		if (copy_from_user(ascii_value, buf, count)) -			return -EFAULT; - -		if (count > 10) { -			if (ascii_value[10] == '\n') -				ascii_value[10] = '\0'; -			else -				return -EINVAL; -		} else { -			ascii_value[count] = '\0'; -		} -		ret = kstrtoul(ascii_value, 16, &ulval); -		if (ret) { -			pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); -			return -EINVAL; -		} -		value = (s32)lower_32_bits(ulval); -	} else { -		return -EINVAL; +		ret = kstrtos32_from_user(buf, count, 16, &value); +		if (ret) +			return ret;  	}  	req = filp->private_data; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 98c3b34a4cff..b38109e204af 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -792,7 +792,8 @@ void free_basic_memory_bitmaps(void)  {  	struct memory_bitmap *bm1, *bm2; -	BUG_ON(!(forbidden_pages_map && free_pages_map)); +	if (WARN_ON(!(forbidden_pages_map && free_pages_map))) +		return;  	bm1 = forbidden_pages_map;  	bm2 = free_pages_map; @@ -1402,7 +1403,11 @@ int hibernate_preallocate_memory(void)  	 * highmem and non-highmem zones separately.  	 */  	pages_highmem = preallocate_image_highmem(highmem / 2); -	alloc = (count - max_size) - pages_highmem; +	alloc = count - max_size; +	if (alloc > pages_highmem) +		alloc -= pages_highmem; +	else +		alloc = 0;  	pages = preallocate_image_memory(alloc, avail_normal);  	if (pages < alloc) {  		/* We have exhausted non-highmem pages, try highmem. */ diff --git a/kernel/power/user.c b/kernel/power/user.c index 957f06164ad1..98d357584cd6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -36,9 +36,9 @@ static struct snapshot_data {  	struct snapshot_handle handle;  	int swap;  	int mode; -	char frozen; -	char ready; -	char platform_support; +	bool frozen; +	bool ready; +	bool platform_support;  	bool free_bitmaps;  } snapshot_state; @@ -70,6 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)  		data->swap = swsusp_resume_device ?  			
swap_type_of(swsusp_resume_device, 0, NULL) : -1;  		data->mode = O_RDONLY; +		data->free_bitmaps = false;  		error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);  		if (error)  			pm_notifier_call_chain(PM_POST_HIBERNATION); @@ -93,9 +94,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)  	if (error)  		atomic_inc(&snapshot_device_available); -	data->frozen = 0; -	data->ready = 0; -	data->platform_support = 0; +	data->frozen = false; +	data->ready = false; +	data->platform_support = false;   Unlock:  	unlock_system_sleep(); @@ -229,7 +230,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  		if (error)  			thaw_processes();  		else -			data->frozen = 1; +			data->frozen = true;  		break; @@ -240,7 +241,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  		free_basic_memory_bitmaps();  		data->free_bitmaps = false;  		thaw_processes(); -		data->frozen = 0; +		data->frozen = false;  		break;  	case SNAPSHOT_CREATE_IMAGE: @@ -270,7 +271,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  	case SNAPSHOT_FREE:  		swsusp_free();  		memset(&data->handle, 0, sizeof(struct snapshot_handle)); -		data->ready = 0; +		data->ready = false;  		/*  		 * It is necessary to thaw kernel threads here, because  		 * SNAPSHOT_CREATE_IMAGE may be invoked directly after @@ -334,7 +335,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  		 * PM_HIBERNATION_PREPARE  		 */  		error = suspend_devices_and_enter(PM_SUSPEND_MEM); -		data->ready = 0; +		data->ready = false;  		break;  	case SNAPSHOT_PLATFORM_SUPPORT: diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b4e8500afdb3..be7c86bae576 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -705,9 +705,9 @@ const struct file_operations kmsg_fops = {  #ifdef CONFIG_KEXEC  /* - * This appends the listed symbols to /proc/vmcoreinfo + * This appends the listed symbols to /proc/vmcore   * - * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to + * /proc/vmcore is used by various utilities, like crash and makedumpfile to   * obtain access to symbols that are otherwise very difficult to locate.  These   * symbols are specifically used so that utilities can access and extract the   * dmesg log from a vmcore file after a crash. 
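As an aside on the kernel/power/qos.c hunk above: pm_qos_power_write() now lets kstrtos32_from_user() copy and parse the ASCII case in one call instead of staging the buffer and calling kstrtoul() by hand. Below is a minimal sketch of that pattern; example_write() and its return convention are illustrative assumptions, only copy_from_user() and kstrtos32_from_user() are real kernel helpers.

/*
 * Illustrative sketch (not part of the patch above): a write handler using
 * kstrtos32_from_user() in the same way the reworked pm_qos_power_write()
 * does. The handler name and surrounding context are hypothetical.
 */
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>

static ssize_t example_write(struct file *filp, const char __user *buf,
			     size_t count, loff_t *f_pos)
{
	s32 value;
	int ret;

	if (count == sizeof(s32)) {
		/* Raw binary write of a 32-bit value. */
		if (copy_from_user(&value, buf, sizeof(s32)))
			return -EFAULT;
	} else {
		/* ASCII write: copy, NUL-terminate and parse (base 16) in one call. */
		ret = kstrtos32_from_user(buf, count, 16, &value);
		if (ret)
			return ret;
	}

	/* ... hand 'value' to whatever this file controls ... */
	return count;
}

The helper bounds the copy to a small on-stack buffer, NUL-terminates it, tolerates a trailing newline and range-checks the parsed value, which is roughly what the removed 11-byte ascii_value handling was approximating by hand.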
@@ -791,7 +791,7 @@ static bool __read_mostly ignore_loglevel;  static int __init ignore_loglevel_setup(char *str)  {  	ignore_loglevel = 1; -	printk(KERN_INFO "debug: ignoring loglevel setting.\n"); +	pr_info("debug: ignoring loglevel setting.\n");  	return 0;  } @@ -820,9 +820,9 @@ static int __init boot_delay_setup(char *str)  	pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "  		"HZ: %d, loops_per_msec: %llu\n",  		boot_delay, preset_lpj, lpj, HZ, loops_per_msec); -	return 1; +	return 0;  } -__setup("boot_delay=", boot_delay_setup); +early_param("boot_delay", boot_delay_setup);  static void boot_delay_msec(int level)  { @@ -2193,7 +2193,7 @@ static int __read_mostly keep_bootcon;  static int __init keep_bootcon_setup(char *str)  {  	keep_bootcon = 1; -	printk(KERN_INFO "debug: skip boot console de-registration.\n"); +	pr_info("debug: skip boot console de-registration.\n");  	return 0;  } @@ -2241,7 +2241,7 @@ void register_console(struct console *newcon)  		/* find the last or real console */  		for_each_console(bcon) {  			if (!(bcon->flags & CON_BOOT)) { -				printk(KERN_INFO "Too late to register bootconsole %s%d\n", +				pr_info("Too late to register bootconsole %s%d\n",  					newcon->name, newcon->index);  				return;  			} @@ -2358,21 +2358,18 @@ void register_console(struct console *newcon)  	 * users know there might be something in the kernel's log buffer that  	 * went to the bootconsole (that they do not see on the real console)  	 */ +	pr_info("%sconsole [%s%d] enabled\n", +		(newcon->flags & CON_BOOT) ? "boot" : "" , +		newcon->name, newcon->index);  	if (bcon &&  	    ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&  	    !keep_bootcon) { -		/* we need to iterate through twice, to make sure we print -		 * everything out, before we unregister the console(s) +		/* We need to iterate through all boot consoles, to make +		 * sure we print everything out, before we unregister them.  		 */ -		printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", -			newcon->name, newcon->index);  		for_each_console(bcon)  			if (bcon->flags & CON_BOOT)  				unregister_console(bcon); -	} else { -		printk(KERN_INFO "%sconsole [%s%d] enabled\n", -			(newcon->flags & CON_BOOT) ? "boot" : "" , -			newcon->name, newcon->index);  	}  }  EXPORT_SYMBOL(register_console); @@ -2382,6 +2379,10 @@ int unregister_console(struct console *console)          struct console *a, *b;  	int res; +	pr_info("%sconsole [%s%d] disabled\n", +		(console->flags & CON_BOOT) ? 
"boot" : "" , +		console->name, console->index); +  	res = _braille_unregister_console(console);  	if (res)  		return res; @@ -2421,8 +2422,6 @@ static int __init printk_late_init(void)  	for_each_console(con) {  		if (!keep_bootcon && con->flags & CON_BOOT) { -			printk(KERN_INFO "turn off boot console %s%d\n", -				con->name, con->index);  			unregister_console(con);  		}  	} @@ -2449,7 +2448,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)  	if (pending & PRINTK_PENDING_SCHED) {  		char *buf = __get_cpu_var(printk_sched_buf); -		printk(KERN_WARNING "[sched_delayed] %s", buf); +		pr_warn("[sched_delayed] %s", buf);  	}  	if (pending & PRINTK_PENDING_WAKEUP) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index dd562e9aa2c8..1f4bcb3cc21c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -257,7 +257,8 @@ ok:  	if (task->mm)  		dumpable = get_dumpable(task->mm);  	rcu_read_lock(); -	if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) { +	if (dumpable != SUID_DUMP_USER && +	    !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {  		rcu_read_unlock();  		return -EPERM;  	} diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile new file mode 100644 index 000000000000..01e9ec37a3e3 --- /dev/null +++ b/kernel/rcu/Makefile @@ -0,0 +1,6 @@ +obj-y += update.o srcu.o +obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o +obj-$(CONFIG_TREE_RCU) += tree.o +obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o +obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o +obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h index 77131966c4ad..7859a0a3951e 100644 --- a/kernel/rcu.h +++ b/kernel/rcu/rcu.h @@ -122,4 +122,11 @@ int rcu_jiffies_till_stall_check(void);  #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ +/* + * Strings used in tracepoints need to be exported via the + * tracing system such that tools like perf and trace-cmd can + * translate the string address pointers to actual text. + */ +#define TPS(x)  tracepoint_string(x) +  #endif /* __LINUX_RCU_H */ diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c index 01d5ccb8bfe3..01d5ccb8bfe3 100644 --- a/kernel/srcu.c +++ b/kernel/rcu/srcu.c diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c index 9ed6075dc562..1254f312d024 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcu/tiny.c @@ -35,6 +35,7 @@  #include <linux/time.h>  #include <linux/cpu.h>  #include <linux/prefetch.h> +#include <linux/ftrace_event.h>  #ifdef CONFIG_RCU_TRACE  #include <trace/events/rcu.h> @@ -42,7 +43,7 @@  #include "rcu.h" -/* Forward declarations for rcutiny_plugin.h. */ +/* Forward declarations for tiny_plugin.h. */  struct rcu_ctrlblk;  static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);  static void rcu_process_callbacks(struct softirq_action *unused); @@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head,  static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; -#include "rcutiny_plugin.h" +#include "tiny_plugin.h"  /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. 
*/  static void rcu_idle_enter_common(long long newval)  {  	if (newval) { -		RCU_TRACE(trace_rcu_dyntick("--=", +		RCU_TRACE(trace_rcu_dyntick(TPS("--="),  					    rcu_dynticks_nesting, newval));  		rcu_dynticks_nesting = newval;  		return;  	} -	RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); +	RCU_TRACE(trace_rcu_dyntick(TPS("Start"), +				    rcu_dynticks_nesting, newval));  	if (!is_idle_task(current)) { -		struct task_struct *idle = idle_task(smp_processor_id()); +		struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); -		RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", +		RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),  					    rcu_dynticks_nesting, newval));  		ftrace_dump(DUMP_ALL);  		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit);  static void rcu_idle_exit_common(long long oldval)  {  	if (oldval) { -		RCU_TRACE(trace_rcu_dyntick("++=", +		RCU_TRACE(trace_rcu_dyntick(TPS("++="),  					    oldval, rcu_dynticks_nesting));  		return;  	} -	RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); +	RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));  	if (!is_idle_task(current)) { -		struct task_struct *idle = idle_task(smp_processor_id()); +		struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); -		RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", +		RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),  			  oldval, rcu_dynticks_nesting));  		ftrace_dump(DUMP_ALL);  		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -174,18 +176,18 @@ void rcu_irq_enter(void)  }  EXPORT_SYMBOL_GPL(rcu_irq_enter); -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)  /*   * Test whether RCU thinks that the current CPU is idle.   */ -int rcu_is_cpu_idle(void) +bool notrace __rcu_is_watching(void)  { -	return !rcu_dynticks_nesting; +	return rcu_dynticks_nesting;  } -EXPORT_SYMBOL(rcu_is_cpu_idle); +EXPORT_SYMBOL(__rcu_is_watching); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */  /*   * Test whether the current CPU was interrupted from idle.  Nested @@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)  	if (&rcp->rcucblist == rcp->donetail) {  		RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));  		RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, -					      ACCESS_ONCE(rcp->rcucblist), +					      !!ACCESS_ONCE(rcp->rcucblist),  					      need_resched(),  					      is_idle_task(current),  					      false)); @@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)  		RCU_TRACE(cb_count++);  	}  	RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); -	RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), +	RCU_TRACE(trace_rcu_batch_end(rcp->name, +				      cb_count, 0, need_resched(),  				      is_idle_task(current),  				      false));  } diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h index 280d06cae352..280d06cae352 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c index be63101c6175..3929cd451511 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcu/torture.c @@ -52,6 +52,12 @@  MODULE_LICENSE("GPL");  MODULE_AUTHOR("Paul E. 
McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); +MODULE_ALIAS("rcutorture"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcutorture." +  static int fqs_duration;  module_param(fqs_duration, int, 0444);  MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c index 32618b3fe4e6..dd081987a8ec 100644 --- a/kernel/rcutree.c +++ b/kernel/rcu/tree.c @@ -41,6 +41,7 @@  #include <linux/export.h>  #include <linux/completion.h>  #include <linux/moduleparam.h> +#include <linux/module.h>  #include <linux/percpu.h>  #include <linux/notifier.h>  #include <linux/cpu.h> @@ -56,17 +57,16 @@  #include <linux/ftrace_event.h>  #include <linux/suspend.h> -#include "rcutree.h" +#include "tree.h"  #include <trace/events/rcu.h>  #include "rcu.h" -/* - * Strings used in tracepoints need to be exported via the - * tracing system such that tools like perf and trace-cmd can - * translate the string address pointers to actual text. - */ -#define TPS(x)	tracepoint_string(x) +MODULE_ALIAS("rcutree"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcutree."  /* Data structures. */ @@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu)  }  EXPORT_SYMBOL_GPL(rcu_note_context_switch); -DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { +static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {  	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,  	.dynticks = ATOMIC_INIT(1),  #ifdef CONFIG_NO_HZ_FULL_SYSIDLE @@ -371,7 +371,8 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,  {  	trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);  	if (!user && !is_idle_task(current)) { -		struct task_struct *idle = idle_task(smp_processor_id()); +		struct task_struct *idle __maybe_unused = +			idle_task(smp_processor_id());  		trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);  		ftrace_dump(DUMP_ORIG); @@ -407,7 +408,7 @@ static void rcu_eqs_enter(bool user)  	long long oldval;  	struct rcu_dynticks *rdtp; -	rdtp = &__get_cpu_var(rcu_dynticks); +	rdtp = this_cpu_ptr(&rcu_dynticks);  	oldval = rdtp->dynticks_nesting;  	WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);  	if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) @@ -435,7 +436,7 @@ void rcu_idle_enter(void)  	local_irq_save(flags);  	rcu_eqs_enter(false); -	rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); +	rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);  	local_irq_restore(flags);  }  EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -478,7 +479,7 @@ void rcu_irq_exit(void)  	struct rcu_dynticks *rdtp;  	local_irq_save(flags); -	rdtp = &__get_cpu_var(rcu_dynticks); +	rdtp = this_cpu_ptr(&rcu_dynticks);  	oldval = rdtp->dynticks_nesting;  	rdtp->dynticks_nesting--;  	WARN_ON_ONCE(rdtp->dynticks_nesting < 0); @@ -508,7 +509,8 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,  	rcu_cleanup_after_idle(smp_processor_id());  	trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);  	if (!user && !is_idle_task(current)) { -		struct task_struct *idle = idle_task(smp_processor_id()); +		struct task_struct *idle __maybe_unused = +			idle_task(smp_processor_id());  		trace_rcu_dyntick(TPS("Error on exit: not idle task"),  				  oldval, rdtp->dynticks_nesting); @@ -528,7 +530,7 @@ static void rcu_eqs_exit(bool user)  	struct rcu_dynticks *rdtp;  	long long oldval; -	rdtp = 
&__get_cpu_var(rcu_dynticks); +	rdtp = this_cpu_ptr(&rcu_dynticks);  	oldval = rdtp->dynticks_nesting;  	WARN_ON_ONCE(oldval < 0);  	if (oldval & DYNTICK_TASK_NEST_MASK) @@ -555,7 +557,7 @@ void rcu_idle_exit(void)  	local_irq_save(flags);  	rcu_eqs_exit(false); -	rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); +	rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);  	local_irq_restore(flags);  }  EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -599,7 +601,7 @@ void rcu_irq_enter(void)  	long long oldval;  	local_irq_save(flags); -	rdtp = &__get_cpu_var(rcu_dynticks); +	rdtp = this_cpu_ptr(&rcu_dynticks);  	oldval = rdtp->dynticks_nesting;  	rdtp->dynticks_nesting++;  	WARN_ON_ONCE(rdtp->dynticks_nesting == 0); @@ -620,7 +622,7 @@ void rcu_irq_enter(void)   */  void rcu_nmi_enter(void)  { -	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); +	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);  	if (rdtp->dynticks_nmi_nesting == 0 &&  	    (atomic_read(&rdtp->dynticks) & 0x1)) @@ -642,7 +644,7 @@ void rcu_nmi_enter(void)   */  void rcu_nmi_exit(void)  { -	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); +	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);  	if (rdtp->dynticks_nmi_nesting == 0 ||  	    --rdtp->dynticks_nmi_nesting != 0) @@ -655,21 +657,34 @@ void rcu_nmi_exit(void)  }  /** - * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle + * __rcu_is_watching - are RCU read-side critical sections safe? + * + * Return true if RCU is watching the running CPU, which means that + * this CPU can safely enter RCU read-side critical sections.  Unlike + * rcu_is_watching(), the caller of __rcu_is_watching() must have at + * least disabled preemption. + */ +bool notrace __rcu_is_watching(void) +{ +	return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; +} + +/** + * rcu_is_watching - see if RCU thinks that the current CPU is idle   *   * If the current CPU is in its idle loop and is neither in an interrupt   * or NMI handler, return true.   */ -int rcu_is_cpu_idle(void) +bool notrace rcu_is_watching(void)  {  	int ret;  	preempt_disable(); -	ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; +	ret = __rcu_is_watching();  	preempt_enable();  	return ret;  } -EXPORT_SYMBOL(rcu_is_cpu_idle); +EXPORT_SYMBOL_GPL(rcu_is_watching);  #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) @@ -703,7 +718,7 @@ bool rcu_lockdep_current_cpu_online(void)  	if (in_nmi())  		return 1;  	preempt_disable(); -	rdp = &__get_cpu_var(rcu_sched_data); +	rdp = this_cpu_ptr(&rcu_sched_data);  	rnp = rdp->mynode;  	ret = (rdp->grpmask & rnp->qsmaskinit) ||  	      !rcu_scheduler_fully_active; @@ -723,7 +738,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);   */  static int rcu_is_cpu_rrupt_from_idle(void)  { -	return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; +	return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;  }  /* @@ -802,8 +817,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,  static void record_gp_stall_check_time(struct rcu_state *rsp)  { -	rsp->gp_start = jiffies; -	rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); +	unsigned long j = ACCESS_ONCE(jiffies); + +	rsp->gp_start = j; +	smp_wmb(); /* Record start time before stall time. */ +	rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();  }  /* @@ -898,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  	force_quiescent_state(rsp);  /* Kick them all. 
*/  } +/* + * This function really isn't for public consumption, but RCU is special in + * that context switches can allow the state machine to make progress. + */ +extern void resched_cpu(int cpu); +  static void print_cpu_stall(struct rcu_state *rsp)  {  	int cpu; @@ -927,22 +951,60 @@ static void print_cpu_stall(struct rcu_state *rsp)  				     3 * rcu_jiffies_till_stall_check() + 3;  	raw_spin_unlock_irqrestore(&rnp->lock, flags); -	set_need_resched();  /* kick ourselves to get things going. */ +	/* +	 * Attempt to revive the RCU machinery by forcing a context switch. +	 * +	 * A context switch would normally allow the RCU state machine to make +	 * progress and it could be we're stuck in kernel space without context +	 * switches for an entirely unreasonable amount of time. +	 */ +	resched_cpu(smp_processor_id());  }  static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)  { +	unsigned long completed; +	unsigned long gpnum; +	unsigned long gps;  	unsigned long j;  	unsigned long js;  	struct rcu_node *rnp; -	if (rcu_cpu_stall_suppress) +	if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))  		return;  	j = ACCESS_ONCE(jiffies); + +	/* +	 * Lots of memory barriers to reject false positives. +	 * +	 * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, +	 * then rsp->gp_start, and finally rsp->completed.  These values +	 * are updated in the opposite order with memory barriers (or +	 * equivalent) during grace-period initialization and cleanup. +	 * Now, a false positive can occur if we get an new value of +	 * rsp->gp_start and a old value of rsp->jiffies_stall.  But given +	 * the memory barriers, the only way that this can happen is if one +	 * grace period ends and another starts between these two fetches. +	 * Detect this by comparing rsp->completed with the previous fetch +	 * from rsp->gpnum. +	 * +	 * Given this check, comparisons of jiffies, rsp->jiffies_stall, +	 * and rsp->gp_start suffice to forestall false positives. +	 */ +	gpnum = ACCESS_ONCE(rsp->gpnum); +	smp_rmb(); /* Pick up ->gpnum first... */  	js = ACCESS_ONCE(rsp->jiffies_stall); +	smp_rmb(); /* ...then ->jiffies_stall before the rest... */ +	gps = ACCESS_ONCE(rsp->gp_start); +	smp_rmb(); /* ...and finally ->gp_start before ->completed. */ +	completed = ACCESS_ONCE(rsp->completed); +	if (ULONG_CMP_GE(completed, gpnum) || +	    ULONG_CMP_LT(j, js) || +	    ULONG_CMP_GE(gps, js)) +		return; /* No stall or GP completed since entering function. */  	rnp = rdp->mynode;  	if (rcu_gp_in_progress(rsp) && -	    (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { +	    (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {  		/* We haven't checked in, so go dump stack. */  		print_cpu_stall(rsp); @@ -1297,7 +1359,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)  }  /* - * Initialize a new grace period. + * Initialize a new grace period.  Return 0 if no grace period required.   */  static int rcu_gp_init(struct rcu_state *rsp)  { @@ -1306,18 +1368,27 @@ static int rcu_gp_init(struct rcu_state *rsp)  	rcu_bind_gp_kthread();  	raw_spin_lock_irq(&rnp->lock); +	if (rsp->gp_flags == 0) { +		/* Spurious wakeup, tell caller to go back to sleep.  */ +		raw_spin_unlock_irq(&rnp->lock); +		return 0; +	}  	rsp->gp_flags = 0; /* Clear all flags: New grace period. */ -	if (rcu_gp_in_progress(rsp)) { -		/* Grace period already in progress, don't start another.  */ +	if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { +		/* +		 * Grace period already in progress, don't start another. 
+		 * Not supposed to be able to happen. +		 */  		raw_spin_unlock_irq(&rnp->lock);  		return 0;  	}  	/* Advance to a new grace period and initialize state. */ +	record_gp_stall_check_time(rsp); +	smp_wmb(); /* Record GP times before starting GP. */  	rsp->gpnum++;  	trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); -	record_gp_stall_check_time(rsp);  	raw_spin_unlock_irq(&rnp->lock);  	/* Exclude any concurrent CPU-hotplug operations. */ @@ -1366,7 +1437,7 @@ static int rcu_gp_init(struct rcu_state *rsp)  /*   * Do one round of quiescent-state forcing.   */ -int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)  {  	int fqs_state = fqs_state_in;  	bool isidle = false; @@ -1451,8 +1522,12 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)  	rsp->fqs_state = RCU_GP_IDLE;  	rdp = this_cpu_ptr(rsp->rda);  	rcu_advance_cbs(rsp, rnp, rdp);  /* Reduce false positives below. */ -	if (cpu_needs_another_gp(rsp, rdp)) -		rsp->gp_flags = 1; +	if (cpu_needs_another_gp(rsp, rdp)) { +		rsp->gp_flags = RCU_GP_FLAG_INIT; +		trace_rcu_grace_period(rsp->name, +				       ACCESS_ONCE(rsp->gpnum), +				       TPS("newreq")); +	}  	raw_spin_unlock_irq(&rnp->lock);  } @@ -1462,6 +1537,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)  static int __noreturn rcu_gp_kthread(void *arg)  {  	int fqs_state; +	int gf;  	unsigned long j;  	int ret;  	struct rcu_state *rsp = arg; @@ -1471,14 +1547,19 @@ static int __noreturn rcu_gp_kthread(void *arg)  		/* Handle grace-period start. */  		for (;;) { +			trace_rcu_grace_period(rsp->name, +					       ACCESS_ONCE(rsp->gpnum), +					       TPS("reqwait"));  			wait_event_interruptible(rsp->gp_wq, -						 rsp->gp_flags & +						 ACCESS_ONCE(rsp->gp_flags) &  						 RCU_GP_FLAG_INIT); -			if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && -			    rcu_gp_init(rsp)) +			if (rcu_gp_init(rsp))  				break;  			cond_resched();  			flush_signals(current); +			trace_rcu_grace_period(rsp->name, +					       ACCESS_ONCE(rsp->gpnum), +					       TPS("reqwaitsig"));  		}  		/* Handle quiescent-state forcing. */ @@ -1488,10 +1569,16 @@ static int __noreturn rcu_gp_kthread(void *arg)  			j = HZ;  			jiffies_till_first_fqs = HZ;  		} +		ret = 0;  		for (;;) { -			rsp->jiffies_force_qs = jiffies + j; +			if (!ret) +				rsp->jiffies_force_qs = jiffies + j; +			trace_rcu_grace_period(rsp->name, +					       ACCESS_ONCE(rsp->gpnum), +					       TPS("fqswait"));  			ret = wait_event_interruptible_timeout(rsp->gp_wq, -					(rsp->gp_flags & RCU_GP_FLAG_FQS) || +					((gf = ACCESS_ONCE(rsp->gp_flags)) & +					 RCU_GP_FLAG_FQS) ||  					(!ACCESS_ONCE(rnp->qsmask) &&  					 !rcu_preempt_blocked_readers_cgp(rnp)),  					j); @@ -1500,13 +1587,23 @@ static int __noreturn rcu_gp_kthread(void *arg)  			    !rcu_preempt_blocked_readers_cgp(rnp))  				break;  			/* If time for quiescent-state forcing, do it. */ -			if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { +			if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || +			    (gf & RCU_GP_FLAG_FQS)) { +				trace_rcu_grace_period(rsp->name, +						       ACCESS_ONCE(rsp->gpnum), +						       TPS("fqsstart"));  				fqs_state = rcu_gp_fqs(rsp, fqs_state); +				trace_rcu_grace_period(rsp->name, +						       ACCESS_ONCE(rsp->gpnum), +						       TPS("fqsend"));  				cond_resched();  			} else {  				/* Deal with stray signal. 
*/  				cond_resched();  				flush_signals(current); +				trace_rcu_grace_period(rsp->name, +						       ACCESS_ONCE(rsp->gpnum), +						       TPS("fqswaitsig"));  			}  			j = jiffies_till_next_fqs;  			if (j > HZ) { @@ -1554,6 +1651,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,  		return;  	}  	rsp->gp_flags = RCU_GP_FLAG_INIT; +	trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), +			       TPS("newreq"));  	/*  	 * We can't do wakeups while holding the rnp->lock, as that @@ -2255,7 +2354,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,  	 * If called from an extended quiescent state, invoke the RCU  	 * core in order to force a re-evaluation of RCU's idleness.  	 */ -	if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) +	if (!rcu_is_watching() && cpu_online(smp_processor_id()))  		invoke_rcu_core();  	/* If interrupts were disabled or CPU offline, don't invoke RCU core. */ @@ -2725,10 +2824,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)  	for_each_rcu_flavor(rsp) {  		rdp = per_cpu_ptr(rsp->rda, cpu); -		if (rdp->qlen != rdp->qlen_lazy) +		if (!rdp->nxtlist) +			continue; +		hc = true; +		if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {  			al = false; -		if (rdp->nxtlist) -			hc = true; +			break; +		}  	}  	if (all_lazy)  		*all_lazy = al; @@ -3216,7 +3318,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,  /*   * Compute the rcu_node tree geometry from kernel parameters.  This cannot - * replace the definitions in rcutree.h because those are needed to size + * replace the definitions in tree.h because those are needed to size   * the ->node array in the rcu_state structure.   */  static void __init rcu_init_geometry(void) @@ -3295,8 +3397,8 @@ void __init rcu_init(void)  	rcu_bootup_announce();  	rcu_init_geometry(); -	rcu_init_one(&rcu_sched_state, &rcu_sched_data);  	rcu_init_one(&rcu_bh_state, &rcu_bh_data); +	rcu_init_one(&rcu_sched_state, &rcu_sched_data);  	__rcu_init_preempt();  	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); @@ -3311,4 +3413,4 @@ void __init rcu_init(void)  		rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);  } -#include "rcutree_plugin.h" +#include "tree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h index 5f97eab602cd..52be957c9fe2 100644 --- a/kernel/rcutree.h +++ b/kernel/rcu/tree.h @@ -104,6 +104,8 @@ struct rcu_dynticks {  				    /* idle-period nonlazy_posted snapshot. */  	unsigned long last_accelerate;  				    /* Last jiffy CBs were accelerated. */ +	unsigned long last_advance_all; +				    /* Last jiffy CBs were all advanced. */  	int tick_nohz_enabled_snap; /* Previously seen value from sysfs. 
*/  #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */  }; diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h index 130c97b027f2..6abb03dff5c0 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -28,7 +28,7 @@  #include <linux/gfp.h>  #include <linux/oom.h>  #include <linux/smpboot.h> -#include "time/tick-internal.h" +#include "../time/tick-internal.h"  #define RCU_KTHREAD_PRIO 1 @@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void)  #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */  #ifdef CONFIG_RCU_NOCB_CPU_ALL  	pr_info("\tOffload RCU callbacks from all CPUs\n"); -	cpumask_setall(rcu_nocb_mask); +	cpumask_copy(rcu_nocb_mask, cpu_possible_mask);  #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */  #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */  	if (have_rcu_nocb_mask) { +		if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { +			pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); +			cpumask_and(rcu_nocb_mask, cpu_possible_mask, +				    rcu_nocb_mask); +		}  		cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);  		pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);  		if (rcu_nocb_poll) @@ -660,7 +665,7 @@ static void rcu_preempt_check_callbacks(int cpu)  static void rcu_preempt_do_callbacks(void)  { -	rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); +	rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));  }  #endif /* #ifdef CONFIG_RCU_BOOST */ @@ -1128,7 +1133,7 @@ void exit_rcu(void)  #ifdef CONFIG_RCU_BOOST -#include "rtmutex_common.h" +#include "../locking/rtmutex_common.h"  #ifdef CONFIG_RCU_TRACE @@ -1332,7 +1337,7 @@ static void invoke_rcu_callbacks_kthread(void)   */  static bool rcu_is_callbacks_kthread(void)  { -	return __get_cpu_var(rcu_cpu_kthread_task) == current; +	return __this_cpu_read(rcu_cpu_kthread_task) == current;  }  #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) @@ -1382,8 +1387,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,  static void rcu_kthread_do_work(void)  { -	rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); -	rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); +	rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); +	rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));  	rcu_preempt_do_callbacks();  } @@ -1402,7 +1407,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu)  static int rcu_cpu_kthread_should_run(unsigned int cpu)  { -	return __get_cpu_var(rcu_cpu_has_work); +	return __this_cpu_read(rcu_cpu_has_work);  }  /* @@ -1412,8 +1417,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)   */  static void rcu_cpu_kthread(unsigned int cpu)  { -	unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); -	char work, *workp = &__get_cpu_var(rcu_cpu_has_work); +	unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); +	char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);  	int spincnt;  	for (spincnt = 0; spincnt < 10; spincnt++) { @@ -1630,17 +1635,23 @@ module_param(rcu_idle_lazy_gp_delay, int, 0644);  extern int tick_nohz_enabled;  /* - * Try to advance callbacks for all flavors of RCU on the current CPU. - * Afterwards, if there are any callbacks ready for immediate invocation, - * return true. + * Try to advance callbacks for all flavors of RCU on the current CPU, but + * only if it has been awhile since the last time we did so.  Afterwards, + * if there are any callbacks ready for immediate invocation, return true.   
*/  static bool rcu_try_advance_all_cbs(void)  {  	bool cbs_ready = false;  	struct rcu_data *rdp; +	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);  	struct rcu_node *rnp;  	struct rcu_state *rsp; +	/* Exit early if we advanced recently. */ +	if (jiffies == rdtp->last_advance_all) +		return 0; +	rdtp->last_advance_all = jiffies; +  	for_each_rcu_flavor(rsp) {  		rdp = this_cpu_ptr(rsp->rda);  		rnp = rdp->mynode; @@ -1739,6 +1750,8 @@ static void rcu_prepare_for_idle(int cpu)  	 */  	if (rdtp->all_lazy &&  	    rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { +		rdtp->all_lazy = false; +		rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;  		invoke_rcu_core();  		return;  	} @@ -1768,17 +1781,11 @@ static void rcu_prepare_for_idle(int cpu)   */  static void rcu_cleanup_after_idle(int cpu)  { -	struct rcu_data *rdp; -	struct rcu_state *rsp;  	if (rcu_is_nocb_cpu(cpu))  		return; -	rcu_try_advance_all_cbs(); -	for_each_rcu_flavor(rsp) { -		rdp = per_cpu_ptr(rsp->rda, cpu); -		if (cpu_has_callbacks_ready_to_invoke(rdp)) -			invoke_rcu_core(); -	} +	if (rcu_try_advance_all_cbs()) +		invoke_rcu_core();  }  /* @@ -2108,15 +2115,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,  	/* If we are not being polled and there is a kthread, awaken it ... */  	t = ACCESS_ONCE(rdp->nocb_kthread); -	if (rcu_nocb_poll | !t) +	if (rcu_nocb_poll || !t) { +		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, +				    TPS("WakeNotPoll"));  		return; +	}  	len = atomic_long_read(&rdp->nocb_q_count);  	if (old_rhpp == &rdp->nocb_head) {  		wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */  		rdp->qlen_last_fqs_check = 0; +		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));  	} else if (len > rdp->qlen_last_fqs_check + qhimark) {  		wake_up_process(t); /* ... or if many callbacks queued. */  		rdp->qlen_last_fqs_check = LONG_MAX / 2; +		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); +	} else { +		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));  	}  	return;  } @@ -2140,10 +2154,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,  	if (__is_kfree_rcu_offset((unsigned long)rhp->func))  		trace_rcu_kfree_callback(rdp->rsp->name, rhp,  					 (unsigned long)rhp->func, -					 rdp->qlen_lazy, rdp->qlen); +					 -atomic_long_read(&rdp->nocb_q_count_lazy), +					 -atomic_long_read(&rdp->nocb_q_count));  	else  		trace_rcu_callback(rdp->rsp->name, rhp, -				   rdp->qlen_lazy, rdp->qlen); +				   -atomic_long_read(&rdp->nocb_q_count_lazy), +				   -atomic_long_read(&rdp->nocb_q_count));  	return 1;  } @@ -2221,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)  static int rcu_nocb_kthread(void *arg)  {  	int c, cl; +	bool firsttime = 1;  	struct rcu_head *list;  	struct rcu_head *next;  	struct rcu_head **tail; @@ -2229,14 +2246,27 @@ static int rcu_nocb_kthread(void *arg)  	/* Each pass through this loop invokes one batch of callbacks */  	for (;;) {  		/* If not polling, wait for next batch of callbacks. 
*/ -		if (!rcu_nocb_poll) +		if (!rcu_nocb_poll) { +			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, +					    TPS("Sleep"));  			wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); +		} else if (firsttime) { +			firsttime = 0; +			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, +					    TPS("Poll")); +		}  		list = ACCESS_ONCE(rdp->nocb_head);  		if (!list) { +			if (!rcu_nocb_poll) +				trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, +						    TPS("WokeEmpty"));  			schedule_timeout_interruptible(1);  			flush_signals(current);  			continue;  		} +		firsttime = 1; +		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, +				    TPS("WokeNonEmpty"));  		/*  		 * Extract queued callbacks, update counts, and wait @@ -2257,7 +2287,11 @@ static int rcu_nocb_kthread(void *arg)  			next = list->next;  			/* Wait for enqueuing to complete, if needed. */  			while (next == NULL && &list->next != tail) { +				trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, +						    TPS("WaitQueue"));  				schedule_timeout_interruptible(1); +				trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, +						    TPS("WokeQueue"));  				next = list->next;  			}  			debug_rcu_head_unqueue(list); diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c index cf6c17412932..3596797b7e46 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -44,7 +44,7 @@  #include <linux/seq_file.h>  #define RCU_TREE_NONCORE -#include "rcutree.h" +#include "tree.h"  static int r_open(struct inode *inode, struct file *file,  					const struct seq_operations *op) diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c index b02a339836b4..6cb3dff89e2b 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcu/update.c @@ -53,6 +53,12 @@  #include "rcu.h" +MODULE_ALIAS("rcupdate"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcupdate." +  module_param(rcu_expedited, int, 0);  #ifdef CONFIG_PREEMPT_RCU @@ -148,7 +154,7 @@ int rcu_read_lock_bh_held(void)  {  	if (!debug_lockdep_rcu_enabled())  		return 1; -	if (rcu_is_cpu_idle()) +	if (!rcu_is_watching())  		return 0;  	if (!rcu_lockdep_current_cpu_online())  		return 0; @@ -298,7 +304,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);  #endif  int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ -int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; +static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;  module_param(rcu_cpu_stall_suppress, int, 0644);  module_param(rcu_cpu_stall_timeout, int, 0644); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 54adcf35f495..7b621409cf15 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer  endif  obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o +obj-y += wait.o completion.o  obj-$(CONFIG_SMP) += cpupri.o  obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o  obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c new file mode 100644 index 000000000000..a63f4dc27909 --- /dev/null +++ b/kernel/sched/completion.c @@ -0,0 +1,299 @@ +/* + * Generic wait-for-completion handler; + * + * It differs from semaphores in that their default case is the opposite, + * wait_for_completion default blocks whereas semaphore default non-block. The + * interface also makes it easy to 'complete' multiple waiting threads, + * something which isn't entirely natural for semaphores. 
+ * + * But more importantly, the primitive documents the usage. Semaphores would + * typically be used for exclusion which gives rise to priority inversion. + * Waiting for completion is a typically sync point, but not an exclusion point. + */ + +#include <linux/sched.h> +#include <linux/completion.h> + +/** + * complete: - signals a single thread waiting on this completion + * @x:  holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete(struct completion *x) +{ +	unsigned long flags; + +	spin_lock_irqsave(&x->wait.lock, flags); +	x->done++; +	__wake_up_locked(&x->wait, TASK_NORMAL, 1); +	spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x:  holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete_all(struct completion *x) +{ +	unsigned long flags; + +	spin_lock_irqsave(&x->wait.lock, flags); +	x->done += UINT_MAX/2; +	__wake_up_locked(&x->wait, TASK_NORMAL, 0); +	spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_all); + +static inline long __sched +do_wait_for_common(struct completion *x, +		   long (*action)(long), long timeout, int state) +{ +	if (!x->done) { +		DECLARE_WAITQUEUE(wait, current); + +		__add_wait_queue_tail_exclusive(&x->wait, &wait); +		do { +			if (signal_pending_state(state, current)) { +				timeout = -ERESTARTSYS; +				break; +			} +			__set_current_state(state); +			spin_unlock_irq(&x->wait.lock); +			timeout = action(timeout); +			spin_lock_irq(&x->wait.lock); +		} while (!x->done && timeout); +		__remove_wait_queue(&x->wait, &wait); +		if (!x->done) +			return timeout; +	} +	x->done--; +	return timeout ?: 1; +} + +static inline long __sched +__wait_for_common(struct completion *x, +		  long (*action)(long), long timeout, int state) +{ +	might_sleep(); + +	spin_lock_irq(&x->wait.lock); +	timeout = do_wait_for_common(x, action, timeout, state); +	spin_unlock_irq(&x->wait.lock); +	return timeout; +} + +static long __sched +wait_for_common(struct completion *x, long timeout, int state) +{ +	return __wait_for_common(x, schedule_timeout, timeout, state); +} + +static long __sched +wait_for_common_io(struct completion *x, long timeout, int state) +{ +	return __wait_for_common(x, io_schedule_timeout, timeout, state); +} + +/** + * wait_for_completion: - waits for completion of a task + * @x:  holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). 
+ */ +void __sched wait_for_completion(struct completion *x) +{ +	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion); + +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_timeout(struct completion *x, unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_timeout); + +/** + * wait_for_completion_io: - waits for completion of a task + * @x:  holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. The caller is accounted as waiting + * for IO. + */ +void __sched wait_for_completion_io(struct completion *x) +{ +	wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io); + +/** + * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. The caller is accounted as waiting for IO. + * + * Return: 0 if timed out, and positive (at least 1, or number of jiffies left + * till timeout) if completed. + */ +unsigned long __sched +wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) +{ +	return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_io_timeout); + +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x:  holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_interruptible(struct completion *x) +{ +	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); +	if (t == -ERESTARTSYS) +		return t; +	return 0; +} +EXPORT_SYMBOL(wait_for_completion_interruptible); + +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. 
+ */ +long __sched +wait_for_completion_interruptible_timeout(struct completion *x, +					  unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); +} +EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); + +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x:  holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + * + * Return: -ERESTARTSYS if interrupted, 0 if completed. + */ +int __sched wait_for_completion_killable(struct completion *x) +{ +	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); +	if (t == -ERESTARTSYS) +		return t; +	return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x:  holds the state of this particular completion + * @timeout:  timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + * + * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, + * or number of jiffies left till timeout) if completed. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, +				     unsigned long timeout) +{ +	return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** + *	try_wait_for_completion - try to decrement a completion without blocking + *	@x:	completion structure + * + *	Return: 0 if a decrement cannot be done without blocking + *		 1 if a decrement succeeded. + * + *	If a completion is being used as a counting completion, + *	attempt to decrement the counter without blocking. This + *	enables us to avoid waiting if the resource the completion + *	is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ +	unsigned long flags; +	int ret = 1; + +	spin_lock_irqsave(&x->wait.lock, flags); +	if (!x->done) +		ret = 0; +	else +		x->done--; +	spin_unlock_irqrestore(&x->wait.lock, flags); +	return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + *	completion_done - Test to see if a completion has any waiters + *	@x:	completion structure + * + *	Return: 0 if there are waiters (wait_for_completion() in progress) + *		 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ +	unsigned long flags; +	int ret = 1; + +	spin_lock_irqsave(&x->wait.lock, flags); +	if (!x->done) +		ret = 0; +	spin_unlock_irqrestore(&x->wait.lock, flags); +	return ret; +} +EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ac63c9a995a..c1808606ee5f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -513,12 +513,11 @@ static inline void init_hrtick(void)   * might also involve a cross-CPU call to trigger the scheduler on   * the target CPU.   
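The non-blocking helpers above (try_wait_for_completion() and completion_done()) allow a completion to act as a counter; a small sketch follows, in which the slot pool and its sizing are hypothetical.

#include <linux/completion.h>
#include <linux/errno.h>

static struct completion free_slots;	/* init_completion() once, then one complete() per slot */

/* Claim a slot if one is available, without ever sleeping. */
static int try_get_slot(void)
{
	if (!try_wait_for_completion(&free_slots))
		return -EBUSY;		/* x->done was zero: nothing to consume */
	return 0;
}

static void put_slot(void)
{
	complete(&free_slots);		/* hand the slot back, waking a blocked waiter if any */
}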
*/ -#ifdef CONFIG_SMP  void resched_task(struct task_struct *p)  {  	int cpu; -	assert_raw_spin_locked(&task_rq(p)->lock); +	lockdep_assert_held(&task_rq(p)->lock);  	if (test_tsk_need_resched(p))  		return; @@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)  	set_tsk_need_resched(p);  	cpu = task_cpu(p); -	if (cpu == smp_processor_id()) +	if (cpu == smp_processor_id()) { +		set_preempt_need_resched();  		return; +	}  	/* NEED_RESCHED must be visible before we test polling */  	smp_mb(); @@ -546,6 +547,7 @@ void resched_cpu(int cpu)  	raw_spin_unlock_irqrestore(&rq->lock, flags);  } +#ifdef CONFIG_SMP  #ifdef CONFIG_NO_HZ_COMMON  /*   * In the semi idle case, use the nearest busy cpu for migrating timers @@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)  	}  } -#else /* !CONFIG_SMP */ -void resched_task(struct task_struct *p) -{ -	assert_raw_spin_locked(&task_rq(p)->lock); -	set_tsk_need_resched(p); -}  #endif /* CONFIG_SMP */  #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)  static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)  {  	update_rq_clock(rq); -	sched_info_queued(p); +	sched_info_queued(rq, p);  	p->sched_class->enqueue_task(rq, p, flags);  }  static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)  {  	update_rq_clock(rq); -	sched_info_dequeued(p); +	sched_info_dequeued(rq, p);  	p->sched_class->dequeue_task(rq, p, flags);  } @@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  	 * ttwu() will sort out the placement.  	 */  	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && -			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); +			!(task_preempt_count(p) & PREEMPT_ACTIVE));  #ifdef CONFIG_LOCKDEP  	/* @@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  	__set_task_cpu(p, new_cpu);  } +static void __migrate_swap_task(struct task_struct *p, int cpu) +{ +	if (p->on_rq) { +		struct rq *src_rq, *dst_rq; + +		src_rq = task_rq(p); +		dst_rq = cpu_rq(cpu); + +		deactivate_task(src_rq, p, 0); +		set_task_cpu(p, cpu); +		activate_task(dst_rq, p, 0); +		check_preempt_curr(dst_rq, p, 0); +	} else { +		/* +		 * Task isn't running anymore; make it appear like we migrated +		 * it before it went to sleep. This means on wakeup we make the +		 * previous cpu our targer instead of where it really is. 
+		 */ +		p->wake_cpu = cpu; +	} +} + +struct migration_swap_arg { +	struct task_struct *src_task, *dst_task; +	int src_cpu, dst_cpu; +}; + +static int migrate_swap_stop(void *data) +{ +	struct migration_swap_arg *arg = data; +	struct rq *src_rq, *dst_rq; +	int ret = -EAGAIN; + +	src_rq = cpu_rq(arg->src_cpu); +	dst_rq = cpu_rq(arg->dst_cpu); + +	double_raw_lock(&arg->src_task->pi_lock, +			&arg->dst_task->pi_lock); +	double_rq_lock(src_rq, dst_rq); +	if (task_cpu(arg->dst_task) != arg->dst_cpu) +		goto unlock; + +	if (task_cpu(arg->src_task) != arg->src_cpu) +		goto unlock; + +	if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) +		goto unlock; + +	if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) +		goto unlock; + +	__migrate_swap_task(arg->src_task, arg->dst_cpu); +	__migrate_swap_task(arg->dst_task, arg->src_cpu); + +	ret = 0; + +unlock: +	double_rq_unlock(src_rq, dst_rq); +	raw_spin_unlock(&arg->dst_task->pi_lock); +	raw_spin_unlock(&arg->src_task->pi_lock); + +	return ret; +} + +/* + * Cross migrate two tasks + */ +int migrate_swap(struct task_struct *cur, struct task_struct *p) +{ +	struct migration_swap_arg arg; +	int ret = -EINVAL; + +	arg = (struct migration_swap_arg){ +		.src_task = cur, +		.src_cpu = task_cpu(cur), +		.dst_task = p, +		.dst_cpu = task_cpu(p), +	}; + +	if (arg.src_cpu == arg.dst_cpu) +		goto out; + +	/* +	 * These three tests are all lockless; this is OK since all of them +	 * will be re-checked with proper locks held further down the line. +	 */ +	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) +		goto out; + +	if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) +		goto out; + +	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) +		goto out; + +	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); + +out: +	return ret; +} +  struct migration_arg {  	struct task_struct *task;  	int dest_cpu; @@ -1236,9 +1333,9 @@ out:   * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.   */  static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)  { -	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); +	cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);  	/*  	 * In order not to call set_task_cpu() on a blocking task we need @@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)  	if (rq->idle_stamp) {  		u64 delta = rq_clock(rq) - rq->idle_stamp; -		u64 max = 2*sysctl_sched_migration_cost; +		u64 max = 2*rq->max_idle_balance_cost; -		if (delta > max) +		update_avg(&rq->avg_idle, delta); + +		if (rq->avg_idle > max)  			rq->avg_idle = max; -		else -			update_avg(&rq->avg_idle, delta); +  		rq->idle_stamp = 0;  	}  #endif @@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void)  void scheduler_ipi(void)  { +	/* +	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting +	 * TIF_NEED_RESCHED remotely (for the first time) will also send +	 * this IPI. 
+	 */ +	if (tif_need_resched()) +		set_preempt_need_resched(); +  	if (llist_empty(&this_rq()->wake_list)  			&& !tick_nohz_full_cpu(smp_processor_id())  			&& !got_nohz_idle_kick()) @@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)  	if (p->sched_class->task_waking)  		p->sched_class->task_waking(p); -	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); +	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);  	if (task_cpu(p) != cpu) {  		wake_flags |= WF_MIGRATED;  		set_task_cpu(p, cpu); @@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)   *   * __sched_fork() is basic setup used by init_idle() too:   */ -static void __sched_fork(struct task_struct *p) +static void __sched_fork(unsigned long clone_flags, struct task_struct *p)  {  	p->on_rq			= 0; @@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)  #ifdef CONFIG_NUMA_BALANCING  	if (p->mm && atomic_read(&p->mm->mm_users) == 1) { -		p->mm->numa_next_scan = jiffies; -		p->mm->numa_next_reset = jiffies; +		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);  		p->mm->numa_scan_seq = 0;  	} +	if (clone_flags & CLONE_VM) +		p->numa_preferred_nid = current->numa_preferred_nid; +	else +		p->numa_preferred_nid = -1; +  	p->node_stamp = 0ULL;  	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; -	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;  	p->numa_scan_period = sysctl_numa_balancing_scan_delay;  	p->numa_work.next = &p->numa_work; +	p->numa_faults = NULL; +	p->numa_faults_buffer = NULL; + +	INIT_LIST_HEAD(&p->numa_entry); +	p->numa_group = NULL;  #endif /* CONFIG_NUMA_BALANCING */  } @@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled)  /*   * fork()/clone()-time setup:   */ -void sched_fork(struct task_struct *p) +void sched_fork(unsigned long clone_flags, struct task_struct *p)  {  	unsigned long flags;  	int cpu = get_cpu(); -	__sched_fork(p); +	__sched_fork(clone_flags, p);  	/*  	 * We mark the process as running here. This guarantees that  	 * nobody will actually run it, and a signal or other external @@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p)  #if defined(CONFIG_SMP)  	p->on_cpu = 0;  #endif -#ifdef CONFIG_PREEMPT_COUNT -	/* Want to start with kernel preemption disabled. */ -	task_thread_info(p)->preempt_count = 1; -#endif +	init_task_preempt_count(p);  #ifdef CONFIG_SMP  	plist_node_init(&p->pushable_tasks, MAX_PRIO);  #endif @@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)  	 *  - cpus_allowed can change in the fork path  	 *  - any previously selected cpu might disappear through hotplug  	 */ -	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); +	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));  #endif  	/* Initialize new task's runnable average */ @@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,  		    struct task_struct *next)  {  	trace_sched_switch(prev, next); -	sched_info_switch(prev, next); +	sched_info_switch(rq, prev, next);  	perf_event_task_sched_out(prev, next);  	fire_sched_out_preempt_notifiers(prev, next);  	prepare_lock_switch(rq, next); @@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	if (mm)  		mmdrop(mm);  	if (unlikely(prev_state == TASK_DEAD)) { +		task_numa_free(prev); +  		/*  		 * Remove function-return probe instances associated with this  		 * task and put them back on the free list. 
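The set_preempt_need_resched() call in scheduler_ipi() above depends on folding the resched request into the preempt count, so the preemption check collapses to a single compare against zero. Below is a rough, architecture-neutral sketch of that idea; the bit value, the plain global variable and the helper bodies are simplifications, not the kernel's exact per-CPU implementation.

#include <stdbool.h>

/* Assumed layout: the top bit caches "need resched", stored inverted, so a
 * raw value of 0 means "no preempt-disable nesting AND a resched is pending". */
#define PREEMPT_NEED_RESCHED	0x80000000u

static unsigned int preempt_count_raw = PREEMPT_NEED_RESCHED;	/* per-CPU in the kernel */

static void set_preempt_need_resched(void)
{
	preempt_count_raw &= ~PREEMPT_NEED_RESCHED;	/* clearing the bit records the request */
}

static void clear_preempt_need_resched(void)
{
	preempt_count_raw |= PREEMPT_NEED_RESCHED;
}

static bool should_preempt(void)
{
	/* One comparison replaces "preempt_count() == 0 && need_resched()". */
	return preempt_count_raw == 0;
}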
@@ -2073,7 +2186,7 @@ void sched_exec(void)  	int dest_cpu;  	raw_spin_lock_irqsave(&p->pi_lock, flags); -	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); +	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);  	if (dest_cpu == smp_processor_id())  		goto unlock; @@ -2140,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)  	struct rq *rq;  	u64 ns = 0; +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) +	/* +	 * 64-bit doesn't need locks to atomically read a 64bit value. +	 * So we have a optimization chance when the task's delta_exec is 0. +	 * Reading ->on_cpu is racy, but this is ok. +	 * +	 * If we race with it leaving cpu, we'll take a lock. So we're correct. +	 * If we race with it entering cpu, unaccounted time is 0. This is +	 * indistinguishable from the read occurring a few cycles earlier. +	 */ +	if (!p->on_cpu) +		return p->se.sum_exec_runtime; +#endif +  	rq = task_rq_lock(p, &flags);  	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);  	task_rq_unlock(rq, p, &flags); @@ -2215,7 +2342,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)  #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \  				defined(CONFIG_PREEMPT_TRACER)) -void __kprobes add_preempt_count(int val) +void __kprobes preempt_count_add(int val)  {  #ifdef CONFIG_DEBUG_PREEMPT  	/* @@ -2224,7 +2351,7 @@ void __kprobes add_preempt_count(int val)  	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))  		return;  #endif -	preempt_count() += val; +	__preempt_count_add(val);  #ifdef CONFIG_DEBUG_PREEMPT  	/*  	 * Spinlock count overflowing soon? @@ -2235,9 +2362,9 @@ void __kprobes add_preempt_count(int val)  	if (preempt_count() == val)  		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));  } -EXPORT_SYMBOL(add_preempt_count); +EXPORT_SYMBOL(preempt_count_add); -void __kprobes sub_preempt_count(int val) +void __kprobes preempt_count_sub(int val)  {  #ifdef CONFIG_DEBUG_PREEMPT  	/* @@ -2255,9 +2382,9 @@ void __kprobes sub_preempt_count(int val)  	if (preempt_count() == val)  		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); -	preempt_count() -= val; +	__preempt_count_sub(val);  } -EXPORT_SYMBOL(sub_preempt_count); +EXPORT_SYMBOL(preempt_count_sub);  #endif @@ -2430,6 +2557,7 @@ need_resched:  	put_prev_task(rq, prev);  	next = pick_next_task(rq);  	clear_tsk_need_resched(prev); +	clear_preempt_need_resched();  	rq->skip_clock_update = 0;  	if (likely(prev != next)) { @@ -2520,9 +2648,9 @@ asmlinkage void __sched notrace preempt_schedule(void)  		return;  	do { -		add_preempt_count_notrace(PREEMPT_ACTIVE); +		__preempt_count_add(PREEMPT_ACTIVE);  		__schedule(); -		sub_preempt_count_notrace(PREEMPT_ACTIVE); +		__preempt_count_sub(PREEMPT_ACTIVE);  		/*  		 * Check again in case we missed a preemption opportunity @@ -2541,20 +2669,19 @@ EXPORT_SYMBOL(preempt_schedule);   */  asmlinkage void __sched preempt_schedule_irq(void)  { -	struct thread_info *ti = current_thread_info();  	enum ctx_state prev_state;  	/* Catch callers which need to be fixed */ -	BUG_ON(ti->preempt_count || !irqs_disabled()); +	BUG_ON(preempt_count() || !irqs_disabled());  	prev_state = exception_enter();  	do { -		add_preempt_count(PREEMPT_ACTIVE); +		__preempt_count_add(PREEMPT_ACTIVE);  		local_irq_enable();  		__schedule();  		local_irq_disable(); -		sub_preempt_count(PREEMPT_ACTIVE); +		__preempt_count_sub(PREEMPT_ACTIVE);  		/*  		 * Check again in case we missed a preemption opportunity @@ -2575,393 +2702,6 @@ int 
default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,  }  EXPORT_SYMBOL(default_wake_function); -/* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, -			int nr_exclusive, int wake_flags, void *key) -{ -	wait_queue_t *curr, *next; - -	list_for_each_entry_safe(curr, next, &q->task_list, task_list) { -		unsigned flags = curr->flags; - -		if (curr->func(curr, mode, wake_flags, key) && -				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) -			break; -	} -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: is directly passed to the wakeup function - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, -			int nr_exclusive, void *key) -{ -	unsigned long flags; - -	spin_lock_irqsave(&q->lock, flags); -	__wake_up_common(q, mode, nr_exclusive, 0, key); -	spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) -{ -	__wake_up_common(q, mode, nr, 0, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_locked); - -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) -{ -	__wake_up_common(q, mode, 1, 0, key); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key); - -/** - * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: opaque value to be passed to wakeup targets - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. 
- */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, -			int nr_exclusive, void *key) -{ -	unsigned long flags; -	int wake_flags = WF_SYNC; - -	if (unlikely(!q)) -		return; - -	if (unlikely(nr_exclusive != 1)) -		wake_flags = 0; - -	spin_lock_irqsave(&q->lock, flags); -	__wake_up_common(q, mode, nr_exclusive, wake_flags, key); -	spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync_key); - -/* - * __wake_up_sync - see __wake_up_sync_key() - */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ -	__wake_up_sync_key(q, mode, nr_exclusive, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */ - -/** - * complete: - signals a single thread waiting on this completion - * @x:  holds the state of this particular completion - * - * This will wake up a single thread waiting on this completion. Threads will be - * awakened in the same order in which they were queued. - * - * See also complete_all(), wait_for_completion() and related routines. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete(struct completion *x) -{ -	unsigned long flags; - -	spin_lock_irqsave(&x->wait.lock, flags); -	x->done++; -	__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); -	spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -/** - * complete_all: - signals all threads waiting on this completion - * @x:  holds the state of this particular completion - * - * This will wake up all threads waiting on this particular completion event. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete_all(struct completion *x) -{ -	unsigned long flags; - -	spin_lock_irqsave(&x->wait.lock, flags); -	x->done += UINT_MAX/2; -	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); -	spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -static inline long __sched -do_wait_for_common(struct completion *x, -		   long (*action)(long), long timeout, int state) -{ -	if (!x->done) { -		DECLARE_WAITQUEUE(wait, current); - -		__add_wait_queue_tail_exclusive(&x->wait, &wait); -		do { -			if (signal_pending_state(state, current)) { -				timeout = -ERESTARTSYS; -				break; -			} -			__set_current_state(state); -			spin_unlock_irq(&x->wait.lock); -			timeout = action(timeout); -			spin_lock_irq(&x->wait.lock); -		} while (!x->done && timeout); -		__remove_wait_queue(&x->wait, &wait); -		if (!x->done) -			return timeout; -	} -	x->done--; -	return timeout ?: 1; -} - -static inline long __sched -__wait_for_common(struct completion *x, -		  long (*action)(long), long timeout, int state) -{ -	might_sleep(); - -	spin_lock_irq(&x->wait.lock); -	timeout = do_wait_for_common(x, action, timeout, state); -	spin_unlock_irq(&x->wait.lock); -	return timeout; -} - -static long __sched -wait_for_common(struct completion *x, long timeout, int state) -{ -	return __wait_for_common(x, schedule_timeout, timeout, state); -} - -static long __sched -wait_for_common_io(struct completion *x, long timeout, int state) -{ -	return __wait_for_common(x, io_schedule_timeout, timeout, state); -} - -/** - * wait_for_completion: - waits for completion of a task - * @x:  holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. 
It is NOT - * interruptible and there is no timeout. - * - * See also similar routines (i.e. wait_for_completion_timeout()) with timeout - * and interrupt capability. Also see complete(). - */ -void __sched wait_for_completion(struct completion *x) -{ -	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion); - -/** - * wait_for_completion_timeout: - waits for completion of a task (w/timeout) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ -	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_timeout); - -/** - * wait_for_completion_io: - waits for completion of a task - * @x:  holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. - */ -void __sched wait_for_completion_io(struct completion *x) -{ -	wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io); - -/** - * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. The caller is accounted as waiting for IO. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) -{ -	return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io_timeout); - -/** - * wait_for_completion_interruptible: - waits for completion of a task (w/intr) - * @x:  holds the state of this particular completion - * - * This waits for completion of a specific task to be signaled. It is - * interruptible. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_interruptible(struct completion *x) -{ -	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); -	if (t == -ERESTARTSYS) -		return t; -	return 0; -} -EXPORT_SYMBOL(wait_for_completion_interruptible); - -/** - * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. It is interruptible. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. 
- */ -long __sched -wait_for_completion_interruptible_timeout(struct completion *x, -					  unsigned long timeout) -{ -	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -/** - * wait_for_completion_killable: - waits for completion of a task (killable) - * @x:  holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It can be - * interrupted by a kill signal. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_killable(struct completion *x) -{ -	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); -	if (t == -ERESTARTSYS) -		return t; -	return 0; -} -EXPORT_SYMBOL(wait_for_completion_killable); - -/** - * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) - * @x:  holds the state of this particular completion - * @timeout:  timeout value in jiffies - * - * This waits for either a completion of a specific task to be - * signaled or for a specified timeout to expire. It can be - * interrupted by a kill signal. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_killable_timeout(struct completion *x, -				     unsigned long timeout) -{ -	return wait_for_common(x, timeout, TASK_KILLABLE); -} -EXPORT_SYMBOL(wait_for_completion_killable_timeout); - -/** - *	try_wait_for_completion - try to decrement a completion without blocking - *	@x:	completion structure - * - *	Return: 0 if a decrement cannot be done without blocking - *		 1 if a decrement succeeded. - * - *	If a completion is being used as a counting completion, - *	attempt to decrement the counter without blocking. This - *	enables us to avoid waiting if the resource the completion - *	is protecting is not available. - */ -bool try_wait_for_completion(struct completion *x) -{ -	unsigned long flags; -	int ret = 1; - -	spin_lock_irqsave(&x->wait.lock, flags); -	if (!x->done) -		ret = 0; -	else -		x->done--; -	spin_unlock_irqrestore(&x->wait.lock, flags); -	return ret; -} -EXPORT_SYMBOL(try_wait_for_completion); - -/** - *	completion_done - Test to see if a completion has any waiters - *	@x:	completion structure - * - *	Return: 0 if there are waiters (wait_for_completion() in progress) - *		 1 if there are no waiters. 
- * - */ -bool completion_done(struct completion *x) -{ -	unsigned long flags; -	int ret = 1; - -	spin_lock_irqsave(&x->wait.lock, flags); -	if (!x->done) -		ret = 0; -	spin_unlock_irqrestore(&x->wait.lock, flags); -	return ret; -} -EXPORT_SYMBOL(completion_done); -  static long __sched  sleep_on_common(wait_queue_head_t *q, int state, long timeout)  { @@ -3598,13 +3338,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)  	struct task_struct *p;  	int retval; -	get_online_cpus();  	rcu_read_lock();  	p = find_process_by_pid(pid);  	if (!p) {  		rcu_read_unlock(); -		put_online_cpus();  		return -ESRCH;  	} @@ -3661,7 +3399,6 @@ out_free_cpus_allowed:  	free_cpumask_var(cpus_allowed);  out_put_task:  	put_task_struct(p); -	put_online_cpus();  	return retval;  } @@ -3706,7 +3443,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)  	unsigned long flags;  	int retval; -	get_online_cpus();  	rcu_read_lock();  	retval = -ESRCH; @@ -3719,12 +3455,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)  		goto out_unlock;  	raw_spin_lock_irqsave(&p->pi_lock, flags); -	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); +	cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);  	raw_spin_unlock_irqrestore(&p->pi_lock, flags);  out_unlock:  	rcu_read_unlock(); -	put_online_cpus();  	return retval;  } @@ -3794,16 +3529,11 @@ SYSCALL_DEFINE0(sched_yield)  	return 0;  } -static inline int should_resched(void) -{ -	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); -} -  static void __cond_resched(void)  { -	add_preempt_count(PREEMPT_ACTIVE); +	__preempt_count_add(PREEMPT_ACTIVE);  	__schedule(); -	sub_preempt_count(PREEMPT_ACTIVE); +	__preempt_count_sub(PREEMPT_ACTIVE);  }  int __sched _cond_resched(void) @@ -4186,7 +3916,7 @@ void init_idle(struct task_struct *idle, int cpu)  	raw_spin_lock_irqsave(&rq->lock, flags); -	__sched_fork(idle); +	__sched_fork(0, idle);  	idle->state = TASK_RUNNING;  	idle->se.exec_start = sched_clock(); @@ -4212,7 +3942,7 @@ void init_idle(struct task_struct *idle, int cpu)  	raw_spin_unlock_irqrestore(&rq->lock, flags);  	/* Set the preempt count _outside_ the spinlocks! 
*/ -	task_thread_info(idle)->preempt_count = 0; +	init_idle_preempt_count(idle, cpu);  	/*  	 * The idle tasks have their own, simple scheduling class: @@ -4346,6 +4076,53 @@ fail:  	return ret;  } +#ifdef CONFIG_NUMA_BALANCING +/* Migrate current task p to target_cpu */ +int migrate_task_to(struct task_struct *p, int target_cpu) +{ +	struct migration_arg arg = { p, target_cpu }; +	int curr_cpu = task_cpu(p); + +	if (curr_cpu == target_cpu) +		return 0; + +	if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) +		return -EINVAL; + +	/* TODO: This is not properly updating schedstats */ + +	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); +} + +/* + * Requeue a task on a given node and accurately track the number of NUMA + * tasks on the runqueues + */ +void sched_setnuma(struct task_struct *p, int nid) +{ +	struct rq *rq; +	unsigned long flags; +	bool on_rq, running; + +	rq = task_rq_lock(p, &flags); +	on_rq = p->on_rq; +	running = task_current(rq, p); + +	if (on_rq) +		dequeue_task(rq, p, 0); +	if (running) +		p->sched_class->put_prev_task(rq, p); + +	p->numa_preferred_nid = nid; + +	if (running) +		p->sched_class->set_curr_task(rq); +	if (on_rq) +		enqueue_task(rq, p, 0); +	task_rq_unlock(rq, p, &flags); +} +#endif +  /*   * migration_cpu_stop - this will be executed by a highprio stopper thread   * and performs thread migration by bumping thread off CPU then @@ -5119,6 +4896,9 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)  DEFINE_PER_CPU(struct sched_domain *, sd_llc);  DEFINE_PER_CPU(int, sd_llc_size);  DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain *, sd_numa); +DEFINE_PER_CPU(struct sched_domain *, sd_busy); +DEFINE_PER_CPU(struct sched_domain *, sd_asym);  static void update_top_cache_domain(int cpu)  { @@ -5130,11 +4910,18 @@ static void update_top_cache_domain(int cpu)  	if (sd) {  		id = cpumask_first(sched_domain_span(sd));  		size = cpumask_weight(sched_domain_span(sd)); +		rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);  	}  	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);  	per_cpu(sd_llc_size, cpu) = size;  	per_cpu(sd_llc_id, cpu) = id; + +	sd = lowest_flag_domain(cpu, SD_NUMA); +	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); + +	sd = highest_flag_domain(cpu, SD_ASYM_PACKING); +	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);  }  /* @@ -5654,6 +5441,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)  					| 0*SD_SHARE_PKG_RESOURCES  					| 1*SD_SERIALIZE  					| 0*SD_PREFER_SIBLING +					| 1*SD_NUMA  					| sd_local_flags(level)  					,  		.last_balance		= jiffies, @@ -6335,14 +6123,17 @@ void __init sched_init_smp(void)  	sched_init_numa(); -	get_online_cpus(); +	/* +	 * There's no userspace yet to cause hotplug operations; hence all the +	 * cpu masks are stable and all blatant races in the below code cannot +	 * happen. 
+	 */  	mutex_lock(&sched_domains_mutex);  	init_sched_domains(cpu_active_mask);  	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);  	if (cpumask_empty(non_isolated_cpus))  		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);  	mutex_unlock(&sched_domains_mutex); -	put_online_cpus();  	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);  	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); @@ -6505,6 +6296,7 @@ void __init sched_init(void)  		rq->online = 0;  		rq->idle_stamp = 0;  		rq->avg_idle = 2*sysctl_sched_migration_cost; +		rq->max_idle_balance_cost = sysctl_sched_migration_cost;  		INIT_LIST_HEAD(&rq->cfs_tasks); @@ -7277,7 +7069,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  	runtime_enabled = quota != RUNTIME_INF;  	runtime_was_enabled = cfs_b->quota != RUNTIME_INF; -	account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); +	/* +	 * If we need to toggle cfs_bandwidth_used, off->on must occur +	 * before making related changes, and on->off must occur afterwards +	 */ +	if (runtime_enabled && !runtime_was_enabled) +		cfs_bandwidth_usage_inc();  	raw_spin_lock_irq(&cfs_b->lock);  	cfs_b->period = ns_to_ktime(period);  	cfs_b->quota = quota; @@ -7303,6 +7100,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  			unthrottle_cfs_rq(cfs_rq);  		raw_spin_unlock_irq(&rq->lock);  	} +	if (runtime_was_enabled && !runtime_enabled) +		cfs_bandwidth_usage_dec();  out_unlock:  	mutex_unlock(&cfs_constraints_mutex); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 196559994f7c..5c34d1817e8f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -15,6 +15,7 @@  #include <linux/seq_file.h>  #include <linux/kallsyms.h>  #include <linux/utsname.h> +#include <linux/mempolicy.h>  #include "sched.h" @@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)  	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",  		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);  #endif +#ifdef CONFIG_NUMA_BALANCING +	SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); +#endif  #ifdef CONFIG_CGROUP_SCHED  	SEQ_printf(m, " %s", task_group_path(task_group(p)));  #endif @@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)  	read_lock_irqsave(&tasklist_lock, flags);  	do_each_thread(g, p) { -		if (!p->on_rq || task_cpu(p) != rq_cpu) +		if (task_cpu(p) != rq_cpu)  			continue;  		print_task(m, rq, p); @@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)  			atomic_read(&cfs_rq->tg->runnable_avg));  #endif  #endif +#ifdef CONFIG_CFS_BANDWIDTH +	SEQ_printf(m, "  .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", +			cfs_rq->tg->cfs_bandwidth.timer_active); +	SEQ_printf(m, "  .%-30s: %d\n", "throttled", +			cfs_rq->throttled); +	SEQ_printf(m, "  .%-30s: %d\n", "throttle_count", +			cfs_rq->throttle_count); +#endif  #ifdef CONFIG_FAIR_GROUP_SCHED  	print_cfs_group_stats(m, cpu, cfs_rq->tg); @@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)  	cpu_clk = local_clock();  	local_irq_restore(flags); -	SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", +	SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",  		init_utsname()->release,  		(int)strcspn(init_utsname()->version, " "),  		init_utsname()->version); @@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void)  __initcall(init_sched_debug_procfs); +#define __P(F) \ +	SEQ_printf(m, 
"%-45s:%21Ld\n", #F, (long long)F) +#define P(F) \ +	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) +#define __PN(F) \ +	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) \ +	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + + +static void sched_show_numa(struct task_struct *p, struct seq_file *m) +{ +#ifdef CONFIG_NUMA_BALANCING +	struct mempolicy *pol; +	int node, i; + +	if (p->mm) +		P(mm->numa_scan_seq); + +	task_lock(p); +	pol = p->mempolicy; +	if (pol && !(pol->flags & MPOL_F_MORON)) +		pol = NULL; +	mpol_get(pol); +	task_unlock(p); + +	SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0)); + +	for_each_online_node(node) { +		for (i = 0; i < 2; i++) { +			unsigned long nr_faults = -1; +			int cpu_current, home_node; + +			if (p->numa_faults) +				nr_faults = p->numa_faults[2*node + i]; + +			cpu_current = !i ? (task_node(p) == node) : +				(pol && node_isset(node, pol->v.nodes)); + +			home_node = (p->numa_preferred_nid == node); + +			SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", +				i, node, cpu_current, home_node, nr_faults); +		} +	} + +	mpol_put(pol); +#endif +} +  void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  {  	unsigned long nr_switches; @@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  		SEQ_printf(m, "%-45s:%21Ld\n",  			   "clock-delta", (long long)(t1-t0));  	} + +	sched_show_numa(p, m);  }  void proc_sched_set_task(struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c70201fbc61..e8b652ebe027 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)  }  #ifdef CONFIG_SMP +static unsigned long task_h_load(struct task_struct *p); +  static inline void __update_task_entity_contrib(struct sched_entity *se);  /* Give new task start runnable values to heavy its load in infant time */ @@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)  #ifdef CONFIG_NUMA_BALANCING  /* - * numa task sample period in ms + * Approximate time to scan a full NUMA task in ms. The task scan period is + * calculated based on the tasks virtual memory size and + * numa_balancing_scan_size.   */ -unsigned int sysctl_numa_balancing_scan_period_min = 100; -unsigned int sysctl_numa_balancing_scan_period_max = 100*50; -unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; +unsigned int sysctl_numa_balancing_scan_period_min = 1000; +unsigned int sysctl_numa_balancing_scan_period_max = 60000;  /* Portion of address space to scan in MB */  unsigned int sysctl_numa_balancing_scan_size = 256; @@ -830,41 +833,835 @@ unsigned int sysctl_numa_balancing_scan_size = 256;  /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */  unsigned int sysctl_numa_balancing_scan_delay = 1000; -static void task_numa_placement(struct task_struct *p) +/* + * After skipping a page migration on a shared page, skip N more numa page + * migrations unconditionally. This reduces the number of NUMA migrations + * in shared memory workloads, and has the effect of pulling tasks towards + * where their memory lives, over pulling the memory towards the task. 
+ */ +unsigned int sysctl_numa_balancing_migrate_deferred = 16; + +static unsigned int task_nr_scan_windows(struct task_struct *p) +{ +	unsigned long rss = 0; +	unsigned long nr_scan_pages; + +	/* +	 * Calculations based on RSS as non-present and empty pages are skipped +	 * by the PTE scanner and NUMA hinting faults should be trapped based +	 * on resident pages +	 */ +	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); +	rss = get_mm_rss(p->mm); +	if (!rss) +		rss = nr_scan_pages; + +	rss = round_up(rss, nr_scan_pages); +	return rss / nr_scan_pages; +} + +/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ +#define MAX_SCAN_WINDOW 2560 + +static unsigned int task_scan_min(struct task_struct *p) +{ +	unsigned int scan, floor; +	unsigned int windows = 1; + +	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) +		windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; +	floor = 1000 / windows; + +	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); +	return max_t(unsigned int, floor, scan); +} + +static unsigned int task_scan_max(struct task_struct *p) +{ +	unsigned int smin = task_scan_min(p); +	unsigned int smax; + +	/* Watch for min being lower than max due to floor calculations */ +	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p); +	return max(smin, smax); +} + +/* + * Once a preferred node is selected the scheduler balancer will prefer moving + * a task to that node for sysctl_numa_balancing_settle_count number of PTE + * scans. This will give the process the chance to accumulate more faults on + * the preferred node but still allow the scheduler to move the task again if + * the nodes CPUs are overloaded. + */ +unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4; + +static void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ +	rq->nr_numa_running += (p->numa_preferred_nid != -1); +	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); +} + +static void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ +	rq->nr_numa_running -= (p->numa_preferred_nid != -1); +	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); +} + +struct numa_group { +	atomic_t refcount; + +	spinlock_t lock; /* nr_tasks, tasks */ +	int nr_tasks; +	pid_t gid; +	struct list_head task_list; + +	struct rcu_head rcu; +	unsigned long total_faults; +	unsigned long faults[0]; +}; + +pid_t task_numa_group_id(struct task_struct *p) +{ +	return p->numa_group ? p->numa_group->gid : 0; +} + +static inline int task_faults_idx(int nid, int priv) +{ +	return 2 * nid + priv; +} + +static inline unsigned long task_faults(struct task_struct *p, int nid) +{ +	if (!p->numa_faults) +		return 0; + +	return p->numa_faults[task_faults_idx(nid, 0)] + +		p->numa_faults[task_faults_idx(nid, 1)]; +} + +static inline unsigned long group_faults(struct task_struct *p, int nid) +{ +	if (!p->numa_group) +		return 0; + +	return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; +} + +/* + * These return the fraction of accesses done by a particular task, or + * task group, on a particular numa node.  The group weight is given a + * larger multiplier, in order to group tasks together that are almost + * evenly spread out between numa nodes. 
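To make the per-mille weights returned by task_weight()/group_weight() (defined immediately below) concrete, here is a worked case; the fault counts are invented.

/*
 * Task with 600 recorded faults on node 0 and 400 on node 1
 * (total_numa_faults = 1000):
 *   task_weight(p, 0)  = 1000 * 600 / 1000   = 600
 *   task_weight(p, 1)  = 1000 * 400 / 1000   = 400
 * If its numa_group holds 7000 of its 10000 faults on node 0:
 *   group_weight(p, 0) = 1000 * 7000 / 10000 = 700
 * Placement decisions compare these normalised values across nodes, so tasks
 * and groups of very different sizes remain directly comparable.
 */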
+ */ +static inline unsigned long task_weight(struct task_struct *p, int nid) +{ +	unsigned long total_faults; + +	if (!p->numa_faults) +		return 0; + +	total_faults = p->total_numa_faults; + +	if (!total_faults) +		return 0; + +	return 1000 * task_faults(p, nid) / total_faults; +} + +static inline unsigned long group_weight(struct task_struct *p, int nid) +{ +	if (!p->numa_group || !p->numa_group->total_faults) +		return 0; + +	return 1000 * group_faults(p, nid) / p->numa_group->total_faults; +} + +static unsigned long weighted_cpuload(const int cpu); +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static unsigned long power_of(int cpu); +static long effective_load(struct task_group *tg, int cpu, long wl, long wg); + +/* Cached statistics for all CPUs within a node */ +struct numa_stats { +	unsigned long nr_running; +	unsigned long load; + +	/* Total compute capacity of CPUs on a node */ +	unsigned long power; + +	/* Approximate capacity in terms of runnable tasks on a node */ +	unsigned long capacity; +	int has_capacity; +}; + +/* + * XXX borrowed from update_sg_lb_stats + */ +static void update_numa_stats(struct numa_stats *ns, int nid)  { -	int seq; +	int cpu, cpus = 0; + +	memset(ns, 0, sizeof(*ns)); +	for_each_cpu(cpu, cpumask_of_node(nid)) { +		struct rq *rq = cpu_rq(cpu); + +		ns->nr_running += rq->nr_running; +		ns->load += weighted_cpuload(cpu); +		ns->power += power_of(cpu); + +		cpus++; +	} -	if (!p->mm)	/* for example, ksmd faulting in a user's mm */ +	/* +	 * If we raced with hotplug and there are no CPUs left in our mask +	 * the @ns structure is NULL'ed and task_numa_compare() will +	 * not find this node attractive. +	 * +	 * We'll either bail at !has_capacity, or we'll detect a huge imbalance +	 * and bail there. +	 */ +	if (!cpus)  		return; + +	ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; +	ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); +	ns->has_capacity = (ns->nr_running < ns->capacity); +} + +struct task_numa_env { +	struct task_struct *p; + +	int src_cpu, src_nid; +	int dst_cpu, dst_nid; + +	struct numa_stats src_stats, dst_stats; + +	int imbalance_pct, idx; + +	struct task_struct *best_task; +	long best_imp; +	int best_cpu; +}; + +static void task_numa_assign(struct task_numa_env *env, +			     struct task_struct *p, long imp) +{ +	if (env->best_task) +		put_task_struct(env->best_task); +	if (p) +		get_task_struct(p); + +	env->best_task = p; +	env->best_imp = imp; +	env->best_cpu = env->dst_cpu; +} + +/* + * This checks if the overall compute and NUMA accesses of the system would + * be improved if the source tasks was migrated to the target dst_cpu taking + * into account that it might be best if task running on the dst_cpu should + * be exchanged with the source task + */ +static void task_numa_compare(struct task_numa_env *env, +			      long taskimp, long groupimp) +{ +	struct rq *src_rq = cpu_rq(env->src_cpu); +	struct rq *dst_rq = cpu_rq(env->dst_cpu); +	struct task_struct *cur; +	long dst_load, src_load; +	long load; +	long imp = (groupimp > 0) ? groupimp : taskimp; + +	rcu_read_lock(); +	cur = ACCESS_ONCE(dst_rq->curr); +	if (cur->pid == 0) /* idle */ +		cur = NULL; + +	/* +	 * "imp" is the fault differential for the source task between the +	 * source and destination node. Calculate the total differential for +	 * the source task and potential destination task. 
The more negative +	 * the value is, the more rmeote accesses that would be expected to +	 * be incurred if the tasks were swapped. +	 */ +	if (cur) { +		/* Skip this swap candidate if cannot move to the source cpu */ +		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur))) +			goto unlock; + +		/* +		 * If dst and source tasks are in the same NUMA group, or not +		 * in any group then look only at task weights. +		 */ +		if (cur->numa_group == env->p->numa_group) { +			imp = taskimp + task_weight(cur, env->src_nid) - +			      task_weight(cur, env->dst_nid); +			/* +			 * Add some hysteresis to prevent swapping the +			 * tasks within a group over tiny differences. +			 */ +			if (cur->numa_group) +				imp -= imp/16; +		} else { +			/* +			 * Compare the group weights. If a task is all by +			 * itself (not part of a group), use the task weight +			 * instead. +			 */ +			if (env->p->numa_group) +				imp = groupimp; +			else +				imp = taskimp; + +			if (cur->numa_group) +				imp += group_weight(cur, env->src_nid) - +				       group_weight(cur, env->dst_nid); +			else +				imp += task_weight(cur, env->src_nid) - +				       task_weight(cur, env->dst_nid); +		} +	} + +	if (imp < env->best_imp) +		goto unlock; + +	if (!cur) { +		/* Is there capacity at our destination? */ +		if (env->src_stats.has_capacity && +		    !env->dst_stats.has_capacity) +			goto unlock; + +		goto balance; +	} + +	/* Balance doesn't matter much if we're running a task per cpu */ +	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) +		goto assign; + +	/* +	 * In the overloaded case, try and keep the load balanced. +	 */ +balance: +	dst_load = env->dst_stats.load; +	src_load = env->src_stats.load; + +	/* XXX missing power terms */ +	load = task_h_load(env->p); +	dst_load += load; +	src_load -= load; + +	if (cur) { +		load = task_h_load(cur); +		dst_load -= load; +		src_load += load; +	} + +	/* make src_load the smaller */ +	if (dst_load < src_load) +		swap(dst_load, src_load); + +	if (src_load * env->imbalance_pct < dst_load * 100) +		goto unlock; + +assign: +	task_numa_assign(env, cur, imp); +unlock: +	rcu_read_unlock(); +} + +static void task_numa_find_cpu(struct task_numa_env *env, +				long taskimp, long groupimp) +{ +	int cpu; + +	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { +		/* Skip this CPU if the source task cannot migrate */ +		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p))) +			continue; + +		env->dst_cpu = cpu; +		task_numa_compare(env, taskimp, groupimp); +	} +} + +static int task_numa_migrate(struct task_struct *p) +{ +	struct task_numa_env env = { +		.p = p, + +		.src_cpu = task_cpu(p), +		.src_nid = task_node(p), + +		.imbalance_pct = 112, + +		.best_task = NULL, +		.best_imp = 0, +		.best_cpu = -1 +	}; +	struct sched_domain *sd; +	unsigned long taskweight, groupweight; +	int nid, ret; +	long taskimp, groupimp; + +	/* +	 * Pick the lowest SD_NUMA domain, as that would have the smallest +	 * imbalance and would be the first to start moving tasks about. +	 * +	 * And we want to avoid any moving of tasks about, as that would create +	 * random movement of tasks -- counter the numa conditions we're trying +	 * to satisfy here. +	 */ +	rcu_read_lock(); +	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); +	if (sd) +		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; +	rcu_read_unlock(); + +	/* +	 * Cpusets can break the scheduler domain tree into smaller +	 * balance domains, some of which do not cross NUMA boundaries. 
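Plugging example numbers into the balance test in task_numa_compare() above; the loads and the imbalance_pct of 112 (a domain value of 125 halved towards 100, as in task_numa_migrate()) are illustrative.

/*
 * After provisionally moving the tasks: src_load = 900, dst_load = 1000,
 * imbalance_pct = 112 (src_load already being the smaller of the two):
 *   src_load * imbalance_pct = 900 * 112  = 100800
 *   dst_load * 100           = 1000 * 100 = 100000
 * 100800 >= 100000, so the move passes the imbalance filter; with
 * dst_load = 1100 it would fail (100800 < 110000) and the swap is skipped.
 */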
+	 * Tasks that are "trapped" in such domains cannot be migrated +	 * elsewhere, so there is no point in (re)trying. +	 */ +	if (unlikely(!sd)) { +		p->numa_preferred_nid = cpu_to_node(task_cpu(p)); +		return -EINVAL; +	} + +	taskweight = task_weight(p, env.src_nid); +	groupweight = group_weight(p, env.src_nid); +	update_numa_stats(&env.src_stats, env.src_nid); +	env.dst_nid = p->numa_preferred_nid; +	taskimp = task_weight(p, env.dst_nid) - taskweight; +	groupimp = group_weight(p, env.dst_nid) - groupweight; +	update_numa_stats(&env.dst_stats, env.dst_nid); + +	/* If the preferred nid has capacity, try to use it. */ +	if (env.dst_stats.has_capacity) +		task_numa_find_cpu(&env, taskimp, groupimp); + +	/* No space available on the preferred nid. Look elsewhere. */ +	if (env.best_cpu == -1) { +		for_each_online_node(nid) { +			if (nid == env.src_nid || nid == p->numa_preferred_nid) +				continue; + +			/* Only consider nodes where both task and groups benefit */ +			taskimp = task_weight(p, nid) - taskweight; +			groupimp = group_weight(p, nid) - groupweight; +			if (taskimp < 0 && groupimp < 0) +				continue; + +			env.dst_nid = nid; +			update_numa_stats(&env.dst_stats, env.dst_nid); +			task_numa_find_cpu(&env, taskimp, groupimp); +		} +	} + +	/* No better CPU than the current one was found. */ +	if (env.best_cpu == -1) +		return -EAGAIN; + +	sched_setnuma(p, env.dst_nid); + +	/* +	 * Reset the scan period if the task is being rescheduled on an +	 * alternative node to recheck if the tasks is now properly placed. +	 */ +	p->numa_scan_period = task_scan_min(p); + +	if (env.best_task == NULL) { +		int ret = migrate_task_to(p, env.best_cpu); +		return ret; +	} + +	ret = migrate_swap(p, env.best_task); +	put_task_struct(env.best_task); +	return ret; +} + +/* Attempt to migrate a task to a CPU on the preferred node. */ +static void numa_migrate_preferred(struct task_struct *p) +{ +	/* This task has no NUMA fault statistics yet */ +	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) +		return; + +	/* Periodically retry migrating the task to the preferred node */ +	p->numa_migrate_retry = jiffies + HZ; + +	/* Success if task is already running on preferred CPU */ +	if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) +		return; + +	/* Otherwise, try migrate to a CPU on the preferred node */ +	task_numa_migrate(p); +} + +/* + * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS + * increments. The more local the fault statistics are, the higher the scan + * period will be for the next scan window. If local/remote ratio is below + * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the + * scan period will decrease + */ +#define NUMA_PERIOD_SLOTS 10 +#define NUMA_PERIOD_THRESHOLD 3 + +/* + * Increase the scan period (slow down scanning) if the majority of + * our memory is already on our local node, or if the majority of + * the page accesses are shared with other processes. + * Otherwise, decrease the scan period. + */ +static void update_task_scan_period(struct task_struct *p, +			unsigned long shared, unsigned long private) +{ +	unsigned int period_slot; +	int ratio; +	int diff; + +	unsigned long remote = p->numa_faults_locality[0]; +	unsigned long local = p->numa_faults_locality[1]; + +	/* +	 * If there were no record hinting faults then either the task is +	 * completely idle or all activity is areas that are not of interest +	 * to automatic numa balancing. 
Scan slower +	 */ +	if (local + shared == 0) { +		p->numa_scan_period = min(p->numa_scan_period_max, +			p->numa_scan_period << 1); + +		p->mm->numa_next_scan = jiffies + +			msecs_to_jiffies(p->numa_scan_period); + +		return; +	} + +	/* +	 * Prepare to scale scan period relative to the current period. +	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same +	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster) +	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower) +	 */ +	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS); +	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote); +	if (ratio >= NUMA_PERIOD_THRESHOLD) { +		int slot = ratio - NUMA_PERIOD_THRESHOLD; +		if (!slot) +			slot = 1; +		diff = slot * period_slot; +	} else { +		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot; + +		/* +		 * Scale scan rate increases based on sharing. There is an +		 * inverse relationship between the degree of sharing and +		 * the adjustment made to the scanning period. Broadly +		 * speaking the intent is that there is little point +		 * scanning faster if shared accesses dominate as it may +		 * simply bounce migrations uselessly +		 */ +		period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS); +		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); +		diff = (diff * ratio) / NUMA_PERIOD_SLOTS; +	} + +	p->numa_scan_period = clamp(p->numa_scan_period + diff, +			task_scan_min(p), task_scan_max(p)); +	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); +} + +static void task_numa_placement(struct task_struct *p) +{ +	int seq, nid, max_nid = -1, max_group_nid = -1; +	unsigned long max_faults = 0, max_group_faults = 0; +	unsigned long fault_types[2] = { 0, 0 }; +	spinlock_t *group_lock = NULL; +  	seq = ACCESS_ONCE(p->mm->numa_scan_seq);  	if (p->numa_scan_seq == seq)  		return;  	p->numa_scan_seq = seq; +	p->numa_scan_period_max = task_scan_max(p); + +	/* If the task is part of a group prevent parallel updates to group stats */ +	if (p->numa_group) { +		group_lock = &p->numa_group->lock; +		spin_lock(group_lock); +	} + +	/* Find the node with the highest number of faults */ +	for_each_online_node(nid) { +		unsigned long faults = 0, group_faults = 0; +		int priv, i; + +		for (priv = 0; priv < 2; priv++) { +			long diff; + +			i = task_faults_idx(nid, priv); +			diff = -p->numa_faults[i]; + +			/* Decay existing window, copy faults since last scan */ +			p->numa_faults[i] >>= 1; +			p->numa_faults[i] += p->numa_faults_buffer[i]; +			fault_types[priv] += p->numa_faults_buffer[i]; +			p->numa_faults_buffer[i] = 0; + +			faults += p->numa_faults[i]; +			diff += p->numa_faults[i]; +			p->total_numa_faults += diff; +			if (p->numa_group) { +				/* safe because we can only change our own group */ +				p->numa_group->faults[i] += diff; +				p->numa_group->total_faults += diff; +				group_faults += p->numa_group->faults[i]; +			} +		} + +		if (faults > max_faults) { +			max_faults = faults; +			max_nid = nid; +		} + +		if (group_faults > max_group_faults) { +			max_group_faults = group_faults; +			max_group_nid = nid; +		} +	} + +	update_task_scan_period(p, fault_types[0], fault_types[1]); + +	if (p->numa_group) { +		/* +		 * If the preferred task and group nids are different, +		 * iterate over the nodes again to find the best place. 
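A worked pass through update_task_scan_period() above, with invented fault counts.

/*
 * Window with 700 local and 300 remote hinting faults, current period 1000 ms,
 * NUMA_PERIOD_SLOTS = 10, NUMA_PERIOD_THRESHOLD = 3:
 *   period_slot = DIV_ROUND_UP(1000, 10)        = 100
 *   ratio       = 700 * 10 / (700 + 300)        = 7   (>= threshold: slow down)
 *   slot        = 7 - 3 = 4,  diff = 4 * 100    = +400 ms
 *   new period  = clamp(1400, task_scan_min(p), task_scan_max(p))
 * Mostly-local access patterns therefore get rescanned progressively less often.
 */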
+		 */ +		if (max_nid != max_group_nid) { +			unsigned long weight, max_weight = 0; + +			for_each_online_node(nid) { +				weight = task_weight(p, nid) + group_weight(p, nid); +				if (weight > max_weight) { +					max_weight = weight; +					max_nid = nid; +				} +			} +		} -	/* FIXME: Scheduling placement policy hints go here */ +		spin_unlock(group_lock); +	} + +	/* Preferred node as the node with the most faults */ +	if (max_faults && max_nid != p->numa_preferred_nid) { +		/* Update the preferred nid and migrate task if possible */ +		sched_setnuma(p, max_nid); +		numa_migrate_preferred(p); +	} +} + +static inline int get_numa_group(struct numa_group *grp) +{ +	return atomic_inc_not_zero(&grp->refcount); +} + +static inline void put_numa_group(struct numa_group *grp) +{ +	if (atomic_dec_and_test(&grp->refcount)) +		kfree_rcu(grp, rcu); +} + +static void task_numa_group(struct task_struct *p, int cpupid, int flags, +			int *priv) +{ +	struct numa_group *grp, *my_grp; +	struct task_struct *tsk; +	bool join = false; +	int cpu = cpupid_to_cpu(cpupid); +	int i; + +	if (unlikely(!p->numa_group)) { +		unsigned int size = sizeof(struct numa_group) + +				    2*nr_node_ids*sizeof(unsigned long); + +		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); +		if (!grp) +			return; + +		atomic_set(&grp->refcount, 1); +		spin_lock_init(&grp->lock); +		INIT_LIST_HEAD(&grp->task_list); +		grp->gid = p->pid; + +		for (i = 0; i < 2*nr_node_ids; i++) +			grp->faults[i] = p->numa_faults[i]; + +		grp->total_faults = p->total_numa_faults; + +		list_add(&p->numa_entry, &grp->task_list); +		grp->nr_tasks++; +		rcu_assign_pointer(p->numa_group, grp); +	} + +	rcu_read_lock(); +	tsk = ACCESS_ONCE(cpu_rq(cpu)->curr); + +	if (!cpupid_match_pid(tsk, cpupid)) +		goto no_join; + +	grp = rcu_dereference(tsk->numa_group); +	if (!grp) +		goto no_join; + +	my_grp = p->numa_group; +	if (grp == my_grp) +		goto no_join; + +	/* +	 * Only join the other group if its bigger; if we're the bigger group, +	 * the other task will join us. +	 */ +	if (my_grp->nr_tasks > grp->nr_tasks) +		goto no_join; + +	/* +	 * Tie-break on the grp address. +	 */ +	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) +		goto no_join; + +	/* Always join threads in the same process. 
*/ +	if (tsk->mm == current->mm) +		join = true; + +	/* Simple filter to avoid false positives due to PID collisions */ +	if (flags & TNF_SHARED) +		join = true; + +	/* Update priv based on whether false sharing was detected */ +	*priv = !join; + +	if (join && !get_numa_group(grp)) +		goto no_join; + +	rcu_read_unlock(); + +	if (!join) +		return; + +	double_lock(&my_grp->lock, &grp->lock); + +	for (i = 0; i < 2*nr_node_ids; i++) { +		my_grp->faults[i] -= p->numa_faults[i]; +		grp->faults[i] += p->numa_faults[i]; +	} +	my_grp->total_faults -= p->total_numa_faults; +	grp->total_faults += p->total_numa_faults; + +	list_move(&p->numa_entry, &grp->task_list); +	my_grp->nr_tasks--; +	grp->nr_tasks++; + +	spin_unlock(&my_grp->lock); +	spin_unlock(&grp->lock); + +	rcu_assign_pointer(p->numa_group, grp); + +	put_numa_group(my_grp); +	return; + +no_join: +	rcu_read_unlock(); +	return; +} + +void task_numa_free(struct task_struct *p) +{ +	struct numa_group *grp = p->numa_group; +	int i; +	void *numa_faults = p->numa_faults; + +	if (grp) { +		spin_lock(&grp->lock); +		for (i = 0; i < 2*nr_node_ids; i++) +			grp->faults[i] -= p->numa_faults[i]; +		grp->total_faults -= p->total_numa_faults; + +		list_del(&p->numa_entry); +		grp->nr_tasks--; +		spin_unlock(&grp->lock); +		rcu_assign_pointer(p->numa_group, NULL); +		put_numa_group(grp); +	} + +	p->numa_faults = NULL; +	p->numa_faults_buffer = NULL; +	kfree(numa_faults);  }  /*   * Got a PROT_NONE fault for a page on @node.   */ -void task_numa_fault(int node, int pages, bool migrated) +void task_numa_fault(int last_cpupid, int node, int pages, int flags)  {  	struct task_struct *p = current; +	bool migrated = flags & TNF_MIGRATED; +	int priv;  	if (!numabalancing_enabled)  		return; -	/* FIXME: Allocate task-specific structure for placement policy here */ +	/* for example, ksmd faulting in a user's mm */ +	if (!p->mm) +		return; + +	/* Do not worry about placement if exiting */ +	if (p->state == TASK_DEAD) +		return; + +	/* Allocate buffer to track faults on a per-node basis */ +	if (unlikely(!p->numa_faults)) { +		int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; + +		/* numa_faults and numa_faults_buffer share the allocation */ +		p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); +		if (!p->numa_faults) +			return; + +		BUG_ON(p->numa_faults_buffer); +		p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); +		p->total_numa_faults = 0; +		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); +	}  	/* -	 * If pages are properly placed (did not migrate) then scan slower. -	 * This is reset periodically in case of phase changes +	 * First accesses are treated as private, otherwise consider accesses +	 * to be private if the accessing pid has not changed  	 */ -        if (!migrated) -		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, -			p->numa_scan_period + jiffies_to_msecs(10)); +	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { +		priv = 1; +	} else { +		priv = cpupid_match_pid(p, last_cpupid); +		if (!priv && !(flags & TNF_NO_GROUP)) +			task_numa_group(p, last_cpupid, flags, &priv); +	}  	task_numa_placement(p); + +	/* +	 * Retry task to preferred node migration periodically, in case it +	 * previously failed, or the scheduler moved us.
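The fault-stats transfer above takes both group locks through double_lock(), which (see the sched.h hunk later in this diff) acquires the two spinlocks in address order so that two tasks concurrently joining each other's groups cannot deadlock. A minimal userspace analogue of that idiom, with pthread mutexes standing in for the spinlocks and the lockdep "nested" annotation omitted:

/*
 * Userspace analogue of double_lock(): always take the lower-addressed
 * lock first to rule out ABBA deadlock between concurrent movers.
 */
#include <pthread.h>
#include <stdio.h>

struct group {
	pthread_mutex_t lock;
	long total_faults;
};

static void double_lock(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	if (l1 > l2) {		/* order by address */
		pthread_mutex_t *tmp = l1;
		l1 = l2;
		l2 = tmp;
	}
	pthread_mutex_lock(l1);
	pthread_mutex_lock(l2);
}

static void double_unlock(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	pthread_mutex_unlock(l1);
	pthread_mutex_unlock(l2);
}

/* Move "faults" worth of accounting from one group to the other. */
static void move_faults(struct group *from, struct group *to, long faults)
{
	double_lock(&from->lock, &to->lock);
	from->total_faults -= faults;
	to->total_faults += faults;
	double_unlock(&from->lock, &to->lock);
}

int main(void)
{
	struct group a = { PTHREAD_MUTEX_INITIALIZER, 100 };
	struct group b = { PTHREAD_MUTEX_INITIALIZER, 0 };

	move_faults(&a, &b, 40);
	printf("a=%ld b=%ld\n", a.total_faults, b.total_faults);
	return 0;
}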
+	 */ +	if (time_after(jiffies, p->numa_migrate_retry)) +		numa_migrate_preferred(p); + +	if (migrated) +		p->numa_pages_migrated += pages; + +	p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; +	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;  }  static void reset_ptenuma_scan(struct task_struct *p) @@ -884,6 +1681,7 @@ void task_numa_work(struct callback_head *work)  	struct mm_struct *mm = p->mm;  	struct vm_area_struct *vma;  	unsigned long start, end; +	unsigned long nr_pte_updates = 0;  	long pages;  	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); @@ -900,35 +1698,9 @@ void task_numa_work(struct callback_head *work)  	if (p->flags & PF_EXITING)  		return; -	/* -	 * We do not care about task placement until a task runs on a node -	 * other than the first one used by the address space. This is -	 * largely because migrations are driven by what CPU the task -	 * is running on. If it's never scheduled on another node, it'll -	 * not migrate so why bother trapping the fault. -	 */ -	if (mm->first_nid == NUMA_PTE_SCAN_INIT) -		mm->first_nid = numa_node_id(); -	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { -		/* Are we running on a new node yet? */ -		if (numa_node_id() == mm->first_nid && -		    !sched_feat_numa(NUMA_FORCE)) -			return; - -		mm->first_nid = NUMA_PTE_SCAN_ACTIVE; -	} - -	/* -	 * Reset the scan period if enough time has gone by. Objective is that -	 * scanning will be reduced if pages are properly placed. As tasks -	 * can enter different phases this needs to be re-examined. Lacking -	 * proper tracking of reference behaviour, this blunt hammer is used. -	 */ -	migrate = mm->numa_next_reset; -	if (time_after(now, migrate)) { -		p->numa_scan_period = sysctl_numa_balancing_scan_period_min; -		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); -		xchg(&mm->numa_next_reset, next_scan); +	if (!mm->numa_next_scan) { +		mm->numa_next_scan = now + +			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);  	}  	/* @@ -938,20 +1710,20 @@ void task_numa_work(struct callback_head *work)  	if (time_before(now, migrate))  		return; -	if (p->numa_scan_period == 0) -		p->numa_scan_period = sysctl_numa_balancing_scan_period_min; +	if (p->numa_scan_period == 0) { +		p->numa_scan_period_max = task_scan_max(p); +		p->numa_scan_period = task_scan_min(p); +	}  	next_scan = now + msecs_to_jiffies(p->numa_scan_period);  	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)  		return;  	/* -	 * Do not set pte_numa if the current running node is rate-limited. -	 * This loses statistics on the fault but if we are unwilling to -	 * migrate to this node, it is less likely we can do useful work +	 * Delay this task enough that another task of this mm will likely win +	 * the next time around.  	 */ -	if (migrate_ratelimited(numa_node_id())) -		return; +	p->node_stamp += 2 * TICK_NSEC;  	start = mm->numa_scan_offset;  	pages = sysctl_numa_balancing_scan_size; @@ -967,18 +1739,32 @@ void task_numa_work(struct callback_head *work)  		vma = mm->mmap;  	}  	for (; vma; vma = vma->vm_next) { -		if (!vma_migratable(vma)) +		if (!vma_migratable(vma) || !vma_policy_mof(p, vma))  			continue; -		/* Skip small VMAs. They are not likely to be of relevance */ -		if (vma->vm_end - vma->vm_start < HPAGE_SIZE) +		/* +		 * Shared library pages mapped by multiple processes are not +		 * migrated as it is expected they are cache replicated. 
Avoid +		 * hinting faults in read-only file-backed mappings or the vdso +		 * as migrating the pages will be of marginal benefit. +		 */ +		if (!vma->vm_mm || +		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))  			continue;  		do {  			start = max(start, vma->vm_start);  			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);  			end = min(end, vma->vm_end); -			pages -= change_prot_numa(vma, start, end); +			nr_pte_updates += change_prot_numa(vma, start, end); + +			/* +			 * Scan sysctl_numa_balancing_scan_size but ensure that +			 * at least one PTE is updated so that unused virtual +			 * address space is quickly skipped. +			 */ +			if (nr_pte_updates) +				pages -= (end - start) >> PAGE_SHIFT;  			start = end;  			if (pages <= 0) @@ -988,10 +1774,10 @@ void task_numa_work(struct callback_head *work)  out:  	/* -	 * It is possible to reach the end of the VMA list but the last few VMAs are -	 * not guaranteed to the vma_migratable. If they are not, we would find the -	 * !migratable VMA on the next scan but not reset the scanner to the start -	 * so check it now. +	 * It is possible to reach the end of the VMA list but the last few +	 * VMAs are not guaranteed to the vma_migratable. If they are not, we +	 * would find the !migratable VMA on the next scan but not reset the +	 * scanner to the start so check it now.  	 */  	if (vma)  		mm->numa_scan_offset = start; @@ -1025,8 +1811,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)  	if (now - curr->node_stamp > period) {  		if (!curr->node_stamp) -			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; -		curr->node_stamp = now; +			curr->numa_scan_period = task_scan_min(curr); +		curr->node_stamp += period;  		if (!time_before(jiffies, curr->mm->numa_next_scan)) {  			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ @@ -1038,6 +1824,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)  static void task_tick_numa(struct rq *rq, struct task_struct *curr)  {  } + +static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ +} + +static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ +}  #endif /* CONFIG_NUMA_BALANCING */  static void @@ -1047,8 +1841,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)  	if (!parent_entity(se))  		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);  #ifdef CONFIG_SMP -	if (entity_is_task(se)) -		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); +	if (entity_is_task(se)) { +		struct rq *rq = rq_of(cfs_rq); + +		account_numa_enqueue(rq, task_of(se)); +		list_add(&se->group_node, &rq->cfs_tasks); +	}  #endif  	cfs_rq->nr_running++;  } @@ -1059,8 +1857,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)  	update_load_sub(&cfs_rq->load, se->load.weight);  	if (!parent_entity(se))  		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); -	if (entity_is_task(se)) +	if (entity_is_task(se)) { +		account_numa_dequeue(rq_of(cfs_rq), task_of(se));  		list_del_init(&se->group_node); +	}  	cfs_rq->nr_running--;  } @@ -1378,7 +2178,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,  	long contrib;  	/* The fraction of a cpu used by this cfs_rq */ -	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, +	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,  			  sa->runnable_avg_period + 1);  	contrib -= cfs_rq->tg_runnable_contrib; @@ -2070,13 +2870,14 @@ static inline bool 
cfs_bandwidth_used(void)  	return static_key_false(&__cfs_bandwidth_used);  } -void account_cfs_bandwidth_used(int enabled, int was_enabled) +void cfs_bandwidth_usage_inc(void) +{ +	static_key_slow_inc(&__cfs_bandwidth_used); +} + +void cfs_bandwidth_usage_dec(void)  { -	/* only need to count groups transitioning between enabled/!enabled */ -	if (enabled && !was_enabled) -		static_key_slow_inc(&__cfs_bandwidth_used); -	else if (!enabled && was_enabled) -		static_key_slow_dec(&__cfs_bandwidth_used); +	static_key_slow_dec(&__cfs_bandwidth_used);  }  #else /* HAVE_JUMP_LABEL */  static bool cfs_bandwidth_used(void) @@ -2084,7 +2885,8 @@ static bool cfs_bandwidth_used(void)  	return true;  } -void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +void cfs_bandwidth_usage_inc(void) {} +void cfs_bandwidth_usage_dec(void) {}  #endif /* HAVE_JUMP_LABEL */  /* @@ -2335,6 +3137,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  	cfs_rq->throttled_clock = rq_clock(rq);  	raw_spin_lock(&cfs_b->lock);  	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); +	if (!cfs_b->timer_active) +		__start_cfs_bandwidth(cfs_b);  	raw_spin_unlock(&cfs_b->lock);  } @@ -2448,6 +3252,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)  	if (idle)  		goto out_unlock; +	/* +	 * if we have relooped after returning idle once, we need to update our +	 * status as actually running, so that other cpus doing +	 * __start_cfs_bandwidth will stop trying to cancel us. +	 */ +	cfs_b->timer_active = 1; +  	__refill_cfs_bandwidth_runtime(cfs_b);  	if (!throttled) { @@ -2508,7 +3319,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;  /* how long we wait to gather additional slack before distributing */  static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; -/* are we near the end of the current quota period? */ +/* + * Are we near the end of the current quota period? + * + * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the + * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of + * migrate_hrtimers, base is never cleared, so we are fine. + */  static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)  {  	struct hrtimer *refresh_timer = &cfs_b->period_timer; @@ -2584,10 +3401,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  	u64 expires;  	/* confirm we're still not at a refresh boundary */ -	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) +	raw_spin_lock(&cfs_b->lock); +	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { +		raw_spin_unlock(&cfs_b->lock);  		return; +	} -	raw_spin_lock(&cfs_b->lock);  	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {  		runtime = cfs_b->runtime;  		cfs_b->runtime = 0; @@ -2708,11 +3527,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)  	 * (timer_active==0 becomes visible before the hrtimer call-back  	 * terminates).  
In either case we ensure that it's re-programmed  	 */ -	while (unlikely(hrtimer_active(&cfs_b->period_timer))) { +	while (unlikely(hrtimer_active(&cfs_b->period_timer)) && +	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { +		/* bounce the lock to allow do_sched_cfs_period_timer to run */  		raw_spin_unlock(&cfs_b->lock); -		/* ensure cfs_b->lock is available while we wait */ -		hrtimer_cancel(&cfs_b->period_timer); - +		cpu_relax();  		raw_spin_lock(&cfs_b->lock);  		/* if someone else restarted the timer then we're done */  		if (cfs_b->timer_active) @@ -3113,7 +3932,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)  {  	struct sched_entity *se = tg->se[cpu]; -	if (!tg->parent)	/* the trivial, non-cgroup case */ +	if (!tg->parent || !wl)	/* the trivial, non-cgroup case */  		return wl;  	for_each_sched_entity(se) { @@ -3166,8 +3985,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)  }  #else -static inline unsigned long effective_load(struct task_group *tg, int cpu, -		unsigned long wl, unsigned long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg)  {  	return wl;  } @@ -3420,11 +4238,10 @@ done:   * preempt must be disabled.   */  static int -select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)  {  	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;  	int cpu = smp_processor_id(); -	int prev_cpu = task_cpu(p);  	int new_cpu = cpu;  	int want_affine = 0;  	int sync = wake_flags & WF_SYNC; @@ -3904,9 +4721,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp  static unsigned long __read_mostly max_load_balance_interval = HZ/10; +enum fbq_type { regular, remote, all }; +  #define LBF_ALL_PINNED	0x01  #define LBF_NEED_BREAK	0x02 -#define LBF_SOME_PINNED 0x04 +#define LBF_DST_PINNED  0x04 +#define LBF_SOME_PINNED	0x08  struct lb_env {  	struct sched_domain	*sd; @@ -3929,6 +4749,8 @@ struct lb_env {  	unsigned int		loop;  	unsigned int		loop_break;  	unsigned int		loop_max; + +	enum fbq_type		fbq_type;  };  /* @@ -3975,6 +4797,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)  	return delta < (s64)sysctl_sched_migration_cost;  } +#ifdef CONFIG_NUMA_BALANCING +/* Returns true if the destination node has incurred more faults */ +static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) +{ +	int src_nid, dst_nid; + +	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || +	    !(env->sd->flags & SD_NUMA)) { +		return false; +	} + +	src_nid = cpu_to_node(env->src_cpu); +	dst_nid = cpu_to_node(env->dst_cpu); + +	if (src_nid == dst_nid) +		return false; + +	/* Always encourage migration to the preferred node. */ +	if (dst_nid == p->numa_preferred_nid) +		return true; + +	/* If both task and group weight improve, this move is a winner. 
*/ +	if (task_weight(p, dst_nid) > task_weight(p, src_nid) && +	    group_weight(p, dst_nid) > group_weight(p, src_nid)) +		return true; + +	return false; +} + + +static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) +{ +	int src_nid, dst_nid; + +	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) +		return false; + +	if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) +		return false; + +	src_nid = cpu_to_node(env->src_cpu); +	dst_nid = cpu_to_node(env->dst_cpu); + +	if (src_nid == dst_nid) +		return false; + +	/* Migrating away from the preferred node is always bad. */ +	if (src_nid == p->numa_preferred_nid) +		return true; + +	/* If either task or group weight get worse, don't do it. */ +	if (task_weight(p, dst_nid) < task_weight(p, src_nid) || +	    group_weight(p, dst_nid) < group_weight(p, src_nid)) +		return true; + +	return false; +} + +#else +static inline bool migrate_improves_locality(struct task_struct *p, +					     struct lb_env *env) +{ +	return false; +} + +static inline bool migrate_degrades_locality(struct task_struct *p, +					     struct lb_env *env) +{ +	return false; +} +#endif +  /*   * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?   */ @@ -3997,6 +4891,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  		schedstat_inc(p, se.statistics.nr_failed_migrations_affine); +		env->flags |= LBF_SOME_PINNED; +  		/*  		 * Remember if this task can be migrated to any other cpu in  		 * our sched_group. We may want to revisit it if we couldn't @@ -4005,13 +4901,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  		 * Also avoid computing new_dst_cpu if we have already computed  		 * one in current iteration.  		 */ -		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) +		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))  			return 0;  		/* Prevent to re-select dst_cpu via env's cpus */  		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {  			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { -				env->flags |= LBF_SOME_PINNED; +				env->flags |= LBF_DST_PINNED;  				env->new_dst_cpu = cpu;  				break;  			} @@ -4030,11 +4926,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  	/*  	 * Aggressive migration if: -	 * 1) task is cache cold, or -	 * 2) too many balance attempts have failed. +	 * 1) destination numa is preferred +	 * 2) task is cache cold, or +	 * 3) too many balance attempts have failed.  	 */ -  	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); +	if (!tsk_cache_hot) +		tsk_cache_hot = migrate_degrades_locality(p, env); + +	if (migrate_improves_locality(p, env)) { +#ifdef CONFIG_SCHEDSTATS +		if (tsk_cache_hot) { +			schedstat_inc(env->sd, lb_hot_gained[env->idle]); +			schedstat_inc(p, se.statistics.nr_forced_migrations); +		} +#endif +		return 1; +	} +  	if (!tsk_cache_hot ||  		env->sd->nr_balance_failed > env->sd->cache_nice_tries) { @@ -4077,8 +4986,6 @@ static int move_one_task(struct lb_env *env)  	return 0;  } -static unsigned long task_h_load(struct task_struct *p); -  static const unsigned int sched_nr_migrate_break = 32;  /* @@ -4291,6 +5198,10 @@ struct sg_lb_stats {  	unsigned int group_weight;  	int group_imb; /* Is there an imbalance in the group ? */  	int group_has_capacity; /* Is there extra capacity in the group? 
*/ +#ifdef CONFIG_NUMA_BALANCING +	unsigned int nr_numa_running; +	unsigned int nr_preferred_running; +#endif  };  /* @@ -4330,7 +5241,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)  /**   * get_sd_load_idx - Obtain the load index for a given sched domain.   * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + * @idle: The idle status of the CPU for whose sd load_idx is obtained.   *   * Return: The load index.   */ @@ -4447,7 +5358,7 @@ void update_group_power(struct sched_domain *sd, int cpu)  {  	struct sched_domain *child = sd->child;  	struct sched_group *group, *sdg = sd->groups; -	unsigned long power; +	unsigned long power, power_orig;  	unsigned long interval;  	interval = msecs_to_jiffies(sd->balance_interval); @@ -4459,7 +5370,7 @@ void update_group_power(struct sched_domain *sd, int cpu)  		return;  	} -	power = 0; +	power_orig = power = 0;  	if (child->flags & SD_OVERLAP) {  		/* @@ -4467,8 +5378,12 @@ void update_group_power(struct sched_domain *sd, int cpu)  		 * span the current group.  		 */ -		for_each_cpu(cpu, sched_group_cpus(sdg)) -			power += power_of(cpu); +		for_each_cpu(cpu, sched_group_cpus(sdg)) { +			struct sched_group *sg = cpu_rq(cpu)->sd->groups; + +			power_orig += sg->sgp->power_orig; +			power += sg->sgp->power; +		}  	} else  {  		/*  		 * !SD_OVERLAP domains can assume that child groups @@ -4477,12 +5392,14 @@ void update_group_power(struct sched_domain *sd, int cpu)  		group = child->groups;  		do { +			power_orig += group->sgp->power_orig;  			power += group->sgp->power;  			group = group->next;  		} while (group != child->groups);  	} -	sdg->sgp->power_orig = sdg->sgp->power = power; +	sdg->sgp->power_orig = power_orig; +	sdg->sgp->power = power;  }  /* @@ -4526,13 +5443,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)   * cpu 3 and leave one of the cpus in the second group unused.   *   * The current solution to this issue is detecting the skew in the first group - * by noticing it has a cpu that is overloaded while the remaining cpus are - * idle -- or rather, there's a distinct imbalance in the cpus; see - * sg_imbalanced(). + * by noticing the lower domain failed to reach balance and had difficulty + * moving tasks due to affinity constraints.   *   * When this is so detected; this group becomes a candidate for busiest; see - * update_sd_pick_busiest(). And calculcate_imbalance() and - * find_busiest_group() avoid some of the usual balance conditional to allow it + * update_sd_pick_busiest(). And calculate_imbalance() and + * find_busiest_group() avoid some of the usual balance conditions to allow it   * to create an effective group imbalance.   *   * This is a somewhat tricky proposition since the next run might not find the @@ -4540,49 +5456,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)   * subtle and fragile situation.   */ -struct sg_imb_stats { -	unsigned long max_nr_running, min_nr_running; -	unsigned long max_cpu_load, min_cpu_load; -}; - -static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) +static inline int sg_imbalanced(struct sched_group *group)  { -	sgi->max_cpu_load = sgi->max_nr_running = 0UL; -	sgi->min_cpu_load = sgi->min_nr_running = ~0UL; +	return group->sgp->imbalance;  } -static inline void -update_sg_imb_stats(struct sg_imb_stats *sgi, -		    unsigned long load, unsigned long nr_running) +/* + * Compute the group capacity. 
+ * + * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by + * first dividing out the smt factor and computing the actual number of cores + * and limit power unit capacity with that. + */ +static inline int sg_capacity(struct lb_env *env, struct sched_group *group)  { -	if (load > sgi->max_cpu_load) -		sgi->max_cpu_load = load; -	if (sgi->min_cpu_load > load) -		sgi->min_cpu_load = load; +	unsigned int capacity, smt, cpus; +	unsigned int power, power_orig; -	if (nr_running > sgi->max_nr_running) -		sgi->max_nr_running = nr_running; -	if (sgi->min_nr_running > nr_running) -		sgi->min_nr_running = nr_running; -} +	power = group->sgp->power; +	power_orig = group->sgp->power_orig; +	cpus = group->group_weight; -static inline int -sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) -{ -	/* -	 * Consider the group unbalanced when the imbalance is larger -	 * than the average weight of a task. -	 * -	 * APZ: with cgroup the avg task weight can vary wildly and -	 *      might not be a suitable number - should we keep a -	 *      normalized nr_running number somewhere that negates -	 *      the hierarchy? -	 */ -	if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && -	    (sgi->max_nr_running - sgi->min_nr_running) > 1) -		return 1; +	/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */ +	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig); +	capacity = cpus / smt; /* cores */ -	return 0; +	capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE)); +	if (!capacity) +		capacity = fix_small_capacity(env->sd, group); + +	return capacity;  }  /** @@ -4597,12 +5500,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,  			struct sched_group *group, int load_idx,  			int local_group, struct sg_lb_stats *sgs)  { -	struct sg_imb_stats sgi;  	unsigned long nr_running;  	unsigned long load;  	int i; -	init_sg_imb_stats(&sgi); +	memset(sgs, 0, sizeof(*sgs));  	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {  		struct rq *rq = cpu_rq(i); @@ -4610,24 +5512,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,  		nr_running = rq->nr_running;  		/* Bias balancing toward cpus of our domain */ -		if (local_group) { +		if (local_group)  			load = target_load(i, load_idx); -		} else { +		else  			load = source_load(i, load_idx); -			update_sg_imb_stats(&sgi, load, nr_running); -		}  		sgs->group_load += load;  		sgs->sum_nr_running += nr_running; +#ifdef CONFIG_NUMA_BALANCING +		sgs->nr_numa_running += rq->nr_numa_running; +		sgs->nr_preferred_running += rq->nr_preferred_running; +#endif  		sgs->sum_weighted_load += weighted_cpuload(i);  		if (idle_cpu(i))  			sgs->idle_cpus++;  	} -	if (local_group && (env->idle != CPU_NEWLY_IDLE || -			time_after_eq(jiffies, group->sgp->next_update))) -		update_group_power(env->sd, env->dst_cpu); -  	/* Adjust by relative CPU power of the group */  	sgs->group_power = group->sgp->power;  	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; @@ -4635,16 +5535,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,  	if (sgs->sum_nr_running)  		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; -	sgs->group_imb = sg_imbalanced(sgs, &sgi); - -	sgs->group_capacity = -		DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); - -	if (!sgs->group_capacity) -		sgs->group_capacity = fix_small_capacity(env->sd, group); -  	sgs->group_weight = group->group_weight; +	sgs->group_imb = sg_imbalanced(group); +	sgs->group_capacity = 
sg_capacity(env, group); +  	if (sgs->group_capacity > sgs->sum_nr_running)  		sgs->group_has_capacity = 1;  } @@ -4693,14 +5588,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,  	return false;  } +#ifdef CONFIG_NUMA_BALANCING +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ +	if (sgs->sum_nr_running > sgs->nr_numa_running) +		return regular; +	if (sgs->sum_nr_running > sgs->nr_preferred_running) +		return remote; +	return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ +	if (rq->nr_running > rq->nr_numa_running) +		return regular; +	if (rq->nr_running > rq->nr_preferred_running) +		return remote; +	return all; +} +#else +static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) +{ +	return all; +} + +static inline enum fbq_type fbq_classify_rq(struct rq *rq) +{ +	return regular; +} +#endif /* CONFIG_NUMA_BALANCING */ +  /**   * update_sd_lb_stats - Update sched_domain's statistics for load balancing.   * @env: The load balancing environment. - * @balance: Should we balance.   * @sds: variable to hold the statistics for this sched_domain.   */ -static inline void update_sd_lb_stats(struct lb_env *env, -					struct sd_lb_stats *sds) +static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)  {  	struct sched_domain *child = env->sd->child;  	struct sched_group *sg = env->sd->groups; @@ -4720,11 +5643,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,  		if (local_group) {  			sds->local = sg;  			sgs = &sds->local_stat; + +			if (env->idle != CPU_NEWLY_IDLE || +			    time_after_eq(jiffies, sg->sgp->next_update)) +				update_group_power(env->sd, env->dst_cpu);  		} -		memset(sgs, 0, sizeof(*sgs));  		update_sg_lb_stats(env, sg, load_idx, local_group, sgs); +		if (local_group) +			goto next_group; +  		/*  		 * In case the child domain prefers tasks go to siblings  		 * first, lower the sg capacity to one so that we'll try @@ -4735,21 +5664,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,  		 * heaviest group when it is already under-utilized (possible  		 * with a large weight task outweighs the tasks on the system).  		 
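The sg_capacity() computation above can be checked with plain arithmetic. A standalone sketch, assuming SCHED_POWER_SCALE is 1024, using invented power numbers, and omitting the fix_small_capacity() fallback:

/*
 * Sketch of the "phantom core" fix: divide out the SMT factor before
 * converting group power into a core count.
 */
#include <stdio.h>

#define SCHED_POWER_SCALE	1024	/* assumed */

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)	(((n) + (d) / 2) / (d))

static unsigned int sg_capacity(unsigned int cpus, unsigned int power,
				unsigned int power_orig)
{
	unsigned int smt, capacity;

	/* smt := ceil(cpus / power), assumes 1 < smt_power < 2 */
	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
	capacity = cpus / smt;			/* real cores */

	if (capacity > DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE))
		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
	return capacity;
}

int main(void)
{
	/* 4 SMT-2 cores, 8 cpus, each core worth ~1178 power units. */
	unsigned int cpus = 8, power_orig = 4 * 1178, power = power_orig;

	/* Naive rounding: 4712/1024 rounds to 5 "phantom" cores. */
	printf("naive:       %u\n", DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
	/* Dividing out the SMT factor first yields the real 4 cores. */
	printf("sg_capacity: %u\n", sg_capacity(cpus, power, power_orig));
	return 0;
}

For a group of four SMT-2 cores reporting 4712 power units, the naive rounding yields five phantom cores while dividing out the SMT factor first yields the real four.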
*/ -		if (prefer_sibling && !local_group && -				sds->local && sds->local_stat.group_has_capacity) +		if (prefer_sibling && sds->local && +		    sds->local_stat.group_has_capacity)  			sgs->group_capacity = min(sgs->group_capacity, 1U); -		/* Now, start updating sd_lb_stats */ -		sds->total_load += sgs->group_load; -		sds->total_pwr += sgs->group_power; - -		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { +		if (update_sd_pick_busiest(env, sds, sg, sgs)) {  			sds->busiest = sg;  			sds->busiest_stat = *sgs;  		} +next_group: +		/* Now, start updating sd_lb_stats */ +		sds->total_load += sgs->group_load; +		sds->total_pwr += sgs->group_power; +  		sg = sg->next;  	} while (sg != env->sd->groups); + +	if (env->sd->flags & SD_NUMA) +		env->fbq_type = fbq_classify_group(&sds->busiest_stat);  }  /** @@ -5053,15 +5986,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,  	int i;  	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { -		unsigned long power = power_of(i); -		unsigned long capacity = DIV_ROUND_CLOSEST(power, -							   SCHED_POWER_SCALE); -		unsigned long wl; +		unsigned long power, capacity, wl; +		enum fbq_type rt; + +		rq = cpu_rq(i); +		rt = fbq_classify_rq(rq); + +		/* +		 * We classify groups/runqueues into three groups: +		 *  - regular: there are !numa tasks +		 *  - remote:  there are numa tasks that run on the 'wrong' node +		 *  - all:     there is no distinction +		 * +		 * In order to avoid migrating ideally placed numa tasks, +		 * ignore those when there's better options. +		 * +		 * If we ignore the actual busiest queue to migrate another +		 * task, the next balance pass can still reduce the busiest +		 * queue by moving tasks around inside the node. +		 * +		 * If we cannot move enough load due to this classification +		 * the next pass will adjust the group classification and +		 * allow migration of more tasks. +		 * +		 * Both cases only affect the total convergence complexity. +		 */ +		if (rt > env->fbq_type) +			continue; +		power = power_of(i); +		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);  		if (!capacity)  			capacity = fix_small_capacity(env->sd, group); -		rq = cpu_rq(i);  		wl = weighted_cpuload(i);  		/* @@ -5164,6 +6121,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,  			int *continue_balancing)  {  	int ld_moved, cur_ld_moved, active_balance = 0; +	struct sched_domain *sd_parent = sd->parent;  	struct sched_group *group;  	struct rq *busiest;  	unsigned long flags; @@ -5177,6 +6135,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,  		.idle		= idle,  		.loop_break	= sched_nr_migrate_break,  		.cpus		= cpus, +		.fbq_type	= all,  	};  	/* @@ -5268,17 +6227,17 @@ more_balance:  		 * moreover subsequent load balance cycles should correct the  		 * excess load moved.  		 */ -		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { +		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { + +			/* Prevent to re-select dst_cpu via env's cpus */ +			cpumask_clear_cpu(env.dst_cpu, env.cpus);  			env.dst_rq	 = cpu_rq(env.new_dst_cpu);  			env.dst_cpu	 = env.new_dst_cpu; -			env.flags	&= ~LBF_SOME_PINNED; +			env.flags	&= ~LBF_DST_PINNED;  			env.loop	 = 0;  			env.loop_break	 = sched_nr_migrate_break; -			/* Prevent to re-select dst_cpu via env's cpus */ -			cpumask_clear_cpu(env.dst_cpu, env.cpus); -  			/*  			 * Go back to "more_balance" rather than "redo" since we  			 * need to continue with same src_cpu. 
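The regular/remote/all filtering used by find_busiest_queue() above is easy to model in isolation. A sketch with mocked-up runqueue counters (the field names mirror the new rq counters maintained by account_numa_enqueue()/account_numa_dequeue(); the numbers are invented):

/*
 * Sketch of the fbq_type classification: prefer pulling non-NUMA tasks,
 * then misplaced NUMA tasks, and only then ideally placed ones.
 */
#include <stdio.h>

enum fbq_type { regular, remote, all };

struct rq_stats {
	unsigned int nr_running;
	unsigned int nr_numa_running;	   /* tasks with NUMA placement data */
	unsigned int nr_preferred_running; /* of those, on their preferred node */
};

static enum fbq_type fbq_classify(const struct rq_stats *rq)
{
	if (rq->nr_running > rq->nr_numa_running)
		return regular;		/* there are !numa tasks to move */
	if (rq->nr_running > rq->nr_preferred_running)
		return remote;		/* only numa tasks, some misplaced */
	return all;			/* everything is ideally placed */
}

/* The busiest-queue scan skips queues "better placed" than the group. */
static int skip_rq(enum fbq_type rq_type, enum fbq_type group_type)
{
	return rq_type > group_type;
}

int main(void)
{
	struct rq_stats mixed  = { 4, 2, 1 };	/* has non-numa tasks */
	struct rq_stats placed = { 3, 3, 3 };	/* all on preferred node */

	printf("mixed=%d placed=%d\n", fbq_classify(&mixed), fbq_classify(&placed));
	printf("skip placed rq when group is 'remote'? %d\n",
	       skip_rq(fbq_classify(&placed), remote));
	return 0;
}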
@@ -5286,6 +6245,18 @@ more_balance:  			goto more_balance;  		} +		/* +		 * We failed to reach balance because of affinity. +		 */ +		if (sd_parent) { +			int *group_imbalance = &sd_parent->groups->sgp->imbalance; + +			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { +				*group_imbalance = 1; +			} else if (*group_imbalance) +				*group_imbalance = 0; +		} +  		/* All tasks on this runqueue were pinned by CPU affinity */  		if (unlikely(env.flags & LBF_ALL_PINNED)) {  			cpumask_clear_cpu(cpu_of(busiest), cpus); @@ -5393,6 +6364,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)  	struct sched_domain *sd;  	int pulled_task = 0;  	unsigned long next_balance = jiffies + HZ; +	u64 curr_cost = 0;  	this_rq->idle_stamp = rq_clock(this_rq); @@ -5409,15 +6381,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)  	for_each_domain(this_cpu, sd) {  		unsigned long interval;  		int continue_balancing = 1; +		u64 t0, domain_cost;  		if (!(sd->flags & SD_LOAD_BALANCE))  			continue; +		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) +			break; +  		if (sd->flags & SD_BALANCE_NEWIDLE) { +			t0 = sched_clock_cpu(this_cpu); +  			/* If we've pulled tasks over stop searching: */  			pulled_task = load_balance(this_cpu, this_rq,  						   sd, CPU_NEWLY_IDLE,  						   &continue_balancing); + +			domain_cost = sched_clock_cpu(this_cpu) - t0; +			if (domain_cost > sd->max_newidle_lb_cost) +				sd->max_newidle_lb_cost = domain_cost; + +			curr_cost += domain_cost;  		}  		interval = msecs_to_jiffies(sd->balance_interval); @@ -5439,6 +6423,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)  		 */  		this_rq->next_balance = next_balance;  	} + +	if (curr_cost > this_rq->max_idle_balance_cost) +		this_rq->max_idle_balance_cost = curr_cost;  }  /* @@ -5572,16 +6559,16 @@ static inline void nohz_balance_exit_idle(int cpu)  static inline void set_cpu_sd_state_busy(void)  {  	struct sched_domain *sd; +	int cpu = smp_processor_id();  	rcu_read_lock(); -	sd = rcu_dereference_check_sched_domain(this_rq()->sd); +	sd = rcu_dereference(per_cpu(sd_busy, cpu));  	if (!sd || !sd->nohz_idle)  		goto unlock;  	sd->nohz_idle = 0; -	for (; sd; sd = sd->parent) -		atomic_inc(&sd->groups->sgp->nr_busy_cpus); +	atomic_inc(&sd->groups->sgp->nr_busy_cpus);  unlock:  	rcu_read_unlock();  } @@ -5589,16 +6576,16 @@ unlock:  void set_cpu_sd_state_idle(void)  {  	struct sched_domain *sd; +	int cpu = smp_processor_id();  	rcu_read_lock(); -	sd = rcu_dereference_check_sched_domain(this_rq()->sd); +	sd = rcu_dereference(per_cpu(sd_busy, cpu));  	if (!sd || sd->nohz_idle)  		goto unlock;  	sd->nohz_idle = 1; -	for (; sd; sd = sd->parent) -		atomic_dec(&sd->groups->sgp->nr_busy_cpus); +	atomic_dec(&sd->groups->sgp->nr_busy_cpus);  unlock:  	rcu_read_unlock();  } @@ -5662,15 +6649,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)  	/* Earliest time when we have to do rebalance again */  	unsigned long next_balance = jiffies + 60*HZ;  	int update_next_balance = 0; -	int need_serialize; +	int need_serialize, need_decay = 0; +	u64 max_cost = 0;  	update_blocked_averages(cpu);  	rcu_read_lock();  	for_each_domain(cpu, sd) { +		/* +		 * Decay the newidle max times here because this is a regular +		 * visit to all the domains. Decay ~1% per second. 
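The 253/256 factor used for max_newidle_lb_cost (see the rebalance_domains() hunk below) works out to a decay of roughly 1% per second, so a one-off expensive newidle balance roughly halves out of the estimate within a minute. A trivial sketch with an invented starting cost:

/* Decay of a recorded newidle balance cost at 253/256 per second. */
#include <stdio.h>

int main(void)
{
	unsigned long long cost = 500000;	/* ns, invented example */
	int sec;

	for (sec = 1; sec <= 60; sec++) {
		cost = (cost * 253) / 256;
		if (sec % 15 == 0)
			printf("after %2ds: %llu ns\n", sec, cost);
	}
	return 0;
}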
+		 */ +		if (time_after(jiffies, sd->next_decay_max_lb_cost)) { +			sd->max_newidle_lb_cost = +				(sd->max_newidle_lb_cost * 253) / 256; +			sd->next_decay_max_lb_cost = jiffies + HZ; +			need_decay = 1; +		} +		max_cost += sd->max_newidle_lb_cost; +  		if (!(sd->flags & SD_LOAD_BALANCE))  			continue; +		/* +		 * Stop the load balance at this level. There is another +		 * CPU in our sched group which is doing load balancing more +		 * actively. +		 */ +		if (!continue_balancing) { +			if (need_decay) +				continue; +			break; +		} +  		interval = sd->balance_interval;  		if (idle != CPU_IDLE)  			interval *= sd->busy_factor; @@ -5689,7 +6700,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)  		if (time_after_eq(jiffies, sd->last_balance + interval)) {  			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {  				/* -				 * The LBF_SOME_PINNED logic could have changed +				 * The LBF_DST_PINNED logic could have changed  				 * env->dst_cpu, so we can't know our idle  				 * state even if we migrated tasks. Update it.  				 */ @@ -5704,14 +6715,14 @@ out:  			next_balance = sd->last_balance + interval;  			update_next_balance = 1;  		} - +	} +	if (need_decay) {  		/* -		 * Stop the load balance at this level. There is another -		 * CPU in our sched group which is doing load balancing more -		 * actively. +		 * Ensure the rq-wide value also decays but keep it at a +		 * reasonable floor to avoid funnies with rq->avg_idle.  		 */ -		if (!continue_balancing) -			break; +		rq->max_idle_balance_cost = +			max((u64)sysctl_sched_migration_cost, max_cost);  	}  	rcu_read_unlock(); @@ -5781,6 +6792,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)  {  	unsigned long now = jiffies;  	struct sched_domain *sd; +	struct sched_group_power *sgp; +	int nr_busy;  	if (unlikely(idle_cpu(cpu)))  		return 0; @@ -5806,22 +6819,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)  		goto need_kick;  	rcu_read_lock(); -	for_each_domain(cpu, sd) { -		struct sched_group *sg = sd->groups; -		struct sched_group_power *sgp = sg->sgp; -		int nr_busy = atomic_read(&sgp->nr_busy_cpus); +	sd = rcu_dereference(per_cpu(sd_busy, cpu)); -		if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) -			goto need_kick_unlock; +	if (sd) { +		sgp = sd->groups->sgp; +		nr_busy = atomic_read(&sgp->nr_busy_cpus); -		if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight -		    && (cpumask_first_and(nohz.idle_cpus_mask, -					  sched_domain_span(sd)) < cpu)) +		if (nr_busy > 1)  			goto need_kick_unlock; - -		if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) -			break;  	} + +	sd = rcu_dereference(per_cpu(sd_asym, cpu)); + +	if (sd && (cpumask_first_and(nohz.idle_cpus_mask, +				  sched_domain_span(sd)) < cpu)) +		goto need_kick_unlock; +  	rcu_read_unlock();  	return 0; @@ -6214,7 +7227,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,  		se->cfs_rq = parent->my_q;  	se->my_q = cfs_rq; -	update_load_set(&se->load, 0); +	/* guarantee group entities always have weight */ +	update_load_set(&se->load, NICE_0_LOAD);  	se->parent = parent;  } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 99399f8e4799..5716929a2e3a 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)  /*   * Apply the automatic NUMA scheduling policy. Enabled automatically   * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing=. 
Allow PTE scanning to be forced on UMA machines - * for debugging the core machinery. + * numa_balancing=   */  #ifdef CONFIG_NUMA_BALANCING  SCHED_FEAT(NUMA,	false) -SCHED_FEAT(NUMA_FORCE,	false) + +/* + * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a + * higher number of hinting faults are recorded during active load + * balancing. + */ +SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) + +/* + * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a + * lower number of hinting faults have been recorded. As this has + * the potential to prevent a task ever migrating to a new node + * due to CPU overload it is disabled by default. + */ +SCHED_FEAT(NUMA_RESIST_LOWER, false)  #endif diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index d8da01008d39..516c3d9ceea1 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -9,7 +9,7 @@  #ifdef CONFIG_SMP  static int -select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)  {  	return task_cpu(p); /* IDLE tasks as never migrated */  } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 01970c8e64df..7d57275fc396 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)  	 * if we should look at the mask. It would be a shame  	 * if we looked at the mask, but the mask was not  	 * updated yet. +	 * +	 * Matched by the barrier in pull_rt_task().  	 */ -	wmb(); +	smp_wmb();  	atomic_inc(&rq->rd->rto_count);  } @@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq)  static int find_lowest_rq(struct task_struct *task);  static int -select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)  {  	struct task_struct *curr;  	struct rq *rq; -	int cpu; - -	cpu = task_cpu(p);  	if (p->nr_cpus_allowed == 1)  		goto out; @@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)  	 */  	if (curr && unlikely(rt_task(curr)) &&  	    (curr->nr_cpus_allowed < 2 || -	     curr->prio <= p->prio) && -	    (p->nr_cpus_allowed > 1)) { +	     curr->prio <= p->prio)) {  		int target = find_lowest_rq(p);  		if (target != -1) @@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq)  	if (likely(!rt_overloaded(this_rq)))  		return 0; +	/* +	 * Match the barrier from rt_set_overloaded; this guarantees that if we +	 * see overloaded we must also see the rto_mask bit. +	 */ +	smp_rmb(); +  	for_each_cpu(cpu, this_rq->rd->rto_mask) {  		if (this_cpu == cpu)  			continue; @@ -1931,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)  	p->rt.time_slice = sched_rr_timeslice;  	/* -	 * Requeue to the end of queue if we (and all of our ancestors) are the -	 * only element on the queue +	 * Requeue to the end of queue if we (and all of our ancestors) are not +	 * the only element on the queue  	 */  	for_each_sched_rt_entity(rt_se) {  		if (rt_se->run_list.prev != rt_se->run_list.next) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b3c5653e1dca..88c85b21d633 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -6,6 +6,7 @@  #include <linux/spinlock.h>  #include <linux/stop_machine.h>  #include <linux/tick.h> +#include <linux/slab.h>  #include "cpupri.h"  #include "cpuacct.h" @@ -408,6 +409,10 @@ struct rq {  	 * remote CPUs use both these fields when doing load calculation.  	 
*/  	unsigned int nr_running; +#ifdef CONFIG_NUMA_BALANCING +	unsigned int nr_numa_running; +	unsigned int nr_preferred_running; +#endif  	#define CPU_LOAD_IDX_MAX 5  	unsigned long cpu_load[CPU_LOAD_IDX_MAX];  	unsigned long last_load_update_tick; @@ -476,6 +481,9 @@ struct rq {  	u64 age_stamp;  	u64 idle_stamp;  	u64 avg_idle; + +	/* This is used to determine avg_idle's max value */ +	u64 max_idle_balance_cost;  #endif  #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)  	return rq->clock_task;  } +#ifdef CONFIG_NUMA_BALANCING +extern void sched_setnuma(struct task_struct *p, int node); +extern int migrate_task_to(struct task_struct *p, int cpu); +extern int migrate_swap(struct task_struct *, struct task_struct *); +#endif /* CONFIG_NUMA_BALANCING */ +  #ifdef CONFIG_SMP  #define rcu_dereference_check_sched_domain(p) \ @@ -593,9 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)  	return hsd;  } +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ +	struct sched_domain *sd; + +	for_each_domain(cpu, sd) { +		if (sd->flags & flag) +			break; +	} + +	return sd; +} +  DECLARE_PER_CPU(struct sched_domain *, sd_llc);  DECLARE_PER_CPU(int, sd_llc_size);  DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(struct sched_domain *, sd_numa); +DECLARE_PER_CPU(struct sched_domain *, sd_busy); +DECLARE_PER_CPU(struct sched_domain *, sd_asym);  struct sched_group_power {  	atomic_t ref; @@ -605,6 +634,7 @@ struct sched_group_power {  	 */  	unsigned int power, power_orig;  	unsigned long next_update; +	int imbalance; /* XXX unrelated to power but shared group state */  	/*  	 * Number of busy cpus in this group.  	 */ @@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)  	 */  	smp_wmb();  	task_thread_info(p)->cpu = cpu; +	p->wake_cpu = cpu;  #endif  } @@ -974,7 +1005,7 @@ struct sched_class {  	void (*put_prev_task) (struct rq *rq, struct task_struct *p);  #ifdef CONFIG_SMP -	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); +	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);  	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);  	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); @@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)  	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);  } +static inline void double_lock(spinlock_t *l1, spinlock_t *l2) +{ +	if (l1 > l2) +		swap(l1, l2); + +	spin_lock(l1); +	spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} + +static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ +	if (l1 > l2) +		swap(l1, l2); + +	raw_spin_lock(l1); +	raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); +} +  /*   * double_rq_lock - safely lock two runqueues   * @@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);  extern void init_cfs_rq(struct cfs_rq *cfs_rq);  extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void account_cfs_bandwidth_used(int enabled, int was_enabled); +extern void cfs_bandwidth_usage_inc(void); +extern void cfs_bandwidth_usage_dec(void);  #ifdef CONFIG_NO_HZ_COMMON  enum rq_nohz_flag_bits { diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c7edee71bce8..4ab704339656 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct 
*t)   * from dequeue_task() to account for possible rq->clock skew across cpus. The   * delta taken on each cpu would annul the skew.   */ -static inline void sched_info_dequeued(struct task_struct *t) +static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)  { -	unsigned long long now = rq_clock(task_rq(t)), delta = 0; +	unsigned long long now = rq_clock(rq), delta = 0;  	if (unlikely(sched_info_on()))  		if (t->sched_info.last_queued) @@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)  	sched_info_reset_dequeued(t);  	t->sched_info.run_delay += delta; -	rq_sched_info_dequeued(task_rq(t), delta); +	rq_sched_info_dequeued(rq, delta);  }  /* @@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)   * long it was waiting to run.  We also note when it began so that we   * can keep stats on how long its timeslice is.   */ -static void sched_info_arrive(struct task_struct *t) +static void sched_info_arrive(struct rq *rq, struct task_struct *t)  { -	unsigned long long now = rq_clock(task_rq(t)), delta = 0; +	unsigned long long now = rq_clock(rq), delta = 0;  	if (t->sched_info.last_queued)  		delta = now - t->sched_info.last_queued; @@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)  	t->sched_info.last_arrival = now;  	t->sched_info.pcount++; -	rq_sched_info_arrive(task_rq(t), delta); +	rq_sched_info_arrive(rq, delta);  }  /* @@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t)   * the timestamp if it is already not set.  It's assumed that   * sched_info_dequeued() will clear that stamp when appropriate.   */ -static inline void sched_info_queued(struct task_struct *t) +static inline void sched_info_queued(struct rq *rq, struct task_struct *t)  {  	if (unlikely(sched_info_on()))  		if (!t->sched_info.last_queued) -			t->sched_info.last_queued = rq_clock(task_rq(t)); +			t->sched_info.last_queued = rq_clock(rq);  }  /* @@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t)   * sched_info_queued() to mark that it has now again started waiting on   * the runqueue.   */ -static inline void sched_info_depart(struct task_struct *t) +static inline void sched_info_depart(struct rq *rq, struct task_struct *t)  { -	unsigned long long delta = rq_clock(task_rq(t)) - +	unsigned long long delta = rq_clock(rq) -  					t->sched_info.last_arrival; -	rq_sched_info_depart(task_rq(t), delta); +	rq_sched_info_depart(rq, delta);  	if (t->state == TASK_RUNNING) -		sched_info_queued(t); +		sched_info_queued(rq, t);  }  /* @@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)   * the idle task.)  We are only called when prev != next.   */  static inline void -__sched_info_switch(struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct rq *rq, +		    struct task_struct *prev, struct task_struct *next)  { -	struct rq *rq = task_rq(prev); -  	/*  	 * prev now departs the cpu.  It's not interesting to record  	 * stats about how efficient we were at scheduling the idle  	 * process, however.  	 
*/  	if (prev != rq->idle) -		sched_info_depart(prev); +		sched_info_depart(rq, prev);  	if (next != rq->idle) -		sched_info_arrive(next); +		sched_info_arrive(rq, next);  }  static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) +sched_info_switch(struct rq *rq, +		  struct task_struct *prev, struct task_struct *next)  {  	if (unlikely(sched_info_on())) -		__sched_info_switch(prev, next); +		__sched_info_switch(rq, prev, next);  }  #else -#define sched_info_queued(t)			do { } while (0) +#define sched_info_queued(rq, t)		do { } while (0)  #define sched_info_reset_dequeued(t)	do { } while (0) -#define sched_info_dequeued(t)			do { } while (0) -#define sched_info_switch(t, next)		do { } while (0) +#define sched_info_dequeued(rq, t)		do { } while (0) +#define sched_info_depart(rq, t)		do { } while (0) +#define sched_info_arrive(rq, next)		do { } while (0) +#define sched_info_switch(rq, t, next)		do { } while (0)  #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */  /* diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index e08fbeeb54b9..47197de8abd9 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -11,7 +11,7 @@  #ifdef CONFIG_SMP  static int -select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)  {  	return task_cpu(p); /* stop tasks as never migrate */  } diff --git a/kernel/wait.c b/kernel/sched/wait.c index d550920e040c..7d50f794e248 100644 --- a/kernel/wait.c +++ b/kernel/sched/wait.c @@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue);  /* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, int wake_flags, void *key) +{ +	wait_queue_t *curr, *next; + +	list_for_each_entry_safe(curr, next, &q->task_list, task_list) { +		unsigned flags = curr->flags; + +		if (curr->func(curr, mode, wake_flags, key) && +				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) +			break; +	} +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, void *key) +{ +	unsigned long flags; + +	spin_lock_irqsave(&q->lock, flags); +	__wake_up_common(q, mode, nr_exclusive, 0, key); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 
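The __wake_up_common() walk above reduces to a simple scan: every non-exclusive waiter is woken, plus at most nr_exclusive exclusive ones, which is why exclusive waiters are queued at the tail (as prepare_to_wait_event() below does via __add_wait_queue_tail()). A standalone sketch with invented waiter names; the wake callback here always reports success, whereas the kernel's (e.g. autoremove_wake_function) may return 0 for a task that is no longer sleeping, in which case the walk keeps scanning:

/* Array-based sketch of the exclusive/non-exclusive wakeup walk. */
#include <stdio.h>

#define WQ_FLAG_EXCLUSIVE	0x01

struct waiter {
	const char *name;
	unsigned int flags;
};

static int wake(struct waiter *w)
{
	printf("woke %s%s\n", w->name,
	       (w->flags & WQ_FLAG_EXCLUSIVE) ? " (exclusive)" : "");
	return 1;	/* pretend the wakeup always succeeds */
}

static void wake_up_common(struct waiter *q, int n, int nr_exclusive)
{
	int i;

	for (i = 0; i < n; i++) {
		struct waiter *curr = &q[i];
		unsigned int flags = curr->flags;

		if (wake(curr) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}
}

int main(void)
{
	/* Exclusive waiters sit at the tail of the queue. */
	struct waiter q[] = {
		{ "poller-a", 0 },
		{ "poller-b", 0 },
		{ "worker-1", WQ_FLAG_EXCLUSIVE },
		{ "worker-2", WQ_FLAG_EXCLUSIVE },
	};

	/* Wakes both pollers and worker-1 only. */
	wake_up_common(q, 4, 1);
	return 0;
}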
+ */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) +{ +	__wake_up_common(q, mode, nr, 0, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked); + +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ +	__wake_up_common(q, mode, 1, 0, key); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key); + +/** + * __wake_up_sync_key - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, +			int nr_exclusive, void *key) +{ +	unsigned long flags; +	int wake_flags = 1; /* XXX WF_SYNC */ + +	if (unlikely(!q)) +		return; + +	if (unlikely(nr_exclusive != 1)) +		wake_flags = 0; + +	spin_lock_irqsave(&q->lock, flags); +	__wake_up_common(q, mode, nr_exclusive, wake_flags, key); +	spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ +	__wake_up_sync_key(q, mode, nr_exclusive, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */ + +/*   * Note: we use "set_current_state()" _after_ the wait-queue add,   * because we need a memory barrier there on SMP, so that any   * wake-function that tests for the wait-queue being active @@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)  }  EXPORT_SYMBOL(prepare_to_wait_exclusive); +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ +	unsigned long flags; + +	if (signal_pending_state(state, current)) +		return -ERESTARTSYS; + +	wait->private = current; +	wait->func = autoremove_wake_function; + +	spin_lock_irqsave(&q->lock, flags); +	if (list_empty(&wait->task_list)) { +		if (wait->flags & WQ_FLAG_EXCLUSIVE) +			__add_wait_queue_tail(q, wait); +		else +			__add_wait_queue(q, wait); +	} +	set_current_state(state); +	spin_unlock_irqrestore(&q->lock, flags); + +	return 0; +} +EXPORT_SYMBOL(prepare_to_wait_event); +  /**   * finish_wait - clean up after waiting in a queue   * @q: waitqueue waited on diff --git a/kernel/signal.c b/kernel/signal.c index ded28b91fa53..940b30ee9a30 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2723,7 +2723,7 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,  #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER -int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) +int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)  {  	int err; diff --git a/kernel/smp.c b/kernel/smp.c index 0564571dcdf7..bd9f94028838 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -15,9 +15,9 @@  #include "smpboot.h" -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS  enum {  	CSD_FLAG_LOCK		= 0x01, +	CSD_FLAG_WAIT		= 0x02,  };  struct call_function_data { @@ -124,7 +124,7 @@ static void csd_lock(struct call_single_data *csd)  static void 
csd_unlock(struct call_single_data *csd)  { -	WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); +	WARN_ON((csd->flags & CSD_FLAG_WAIT) && !(csd->flags & CSD_FLAG_LOCK));  	/*  	 * ensure we're all done before releasing data: @@ -139,13 +139,15 @@ static void csd_unlock(struct call_single_data *csd)   * for execution on the given CPU. data must already have   * ->func, ->info, and ->flags set.   */ -static -void generic_exec_single(int cpu, struct call_single_data *csd, int wait) +static void generic_exec_single(int cpu, struct call_single_data *csd, int wait)  {  	struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);  	unsigned long flags;  	int ipi; +	if (wait) +		csd->flags |= CSD_FLAG_WAIT; +  	raw_spin_lock_irqsave(&dst->lock, flags);  	ipi = list_empty(&dst->list);  	list_add_tail(&csd->list, &dst->list); @@ -340,6 +342,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd,  	}  	put_cpu();  } +EXPORT_SYMBOL_GPL(__smp_call_function_single);  /**   * smp_call_function_many(): Run a function on a set of other CPUs. @@ -459,7 +462,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait)  	return 0;  }  EXPORT_SYMBOL(smp_call_function); -#endif /* USE_GENERIC_SMP_HELPERS */  /* Setup configured maximum number of CPUs to activate */  unsigned int setup_max_cpus = NR_CPUS; @@ -524,6 +526,11 @@ void __init setup_nr_cpu_ids(void)  	nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;  } +void __weak smp_announce(void) +{ +	printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus()); +} +  /* Called by boot processor to activate the rest. */  void __init smp_init(void)  { @@ -540,7 +547,7 @@ void __init smp_init(void)  	}  	/* Any cleanup work */ -	printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); +	smp_announce();  	smp_cpus_done(setup_max_cpus);  } diff --git a/kernel/softirq.c b/kernel/softirq.c index d7d498d8cc4f..11025ccc06dd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,8 +6,6 @@   *	Distribute under GPLv2.   *   *	Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) - * - *	Remote softirq infrastructure is by Jens Axboe.   */  #include <linux/export.h> @@ -29,7 +27,6 @@  #define CREATE_TRACE_POINTS  #include <trace/events/irq.h> -#include <asm/irq.h>  /*     - No shared variables, all the data are CPU local.     - If a softirq needs serialization, let it serialize itself @@ -100,13 +97,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)  	raw_local_irq_save(flags);  	/* -	 * The preempt tracer hooks into add_preempt_count and will break +	 * The preempt tracer hooks into preempt_count_add and will break  	 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET  	 * is set and before current->softirq_enabled is cleared.  	 * We must manually increment preempt_count here and manually  	 * call the trace_preempt_off later.  	 
*/ -	preempt_count() += cnt; +	__preempt_count_add(cnt);  	/*  	 * Were softirqs turned off above:  	 */ @@ -120,7 +117,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)  #else /* !CONFIG_TRACE_IRQFLAGS */  static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)  { -	add_preempt_count(cnt); +	preempt_count_add(cnt);  	barrier();  }  #endif /* CONFIG_TRACE_IRQFLAGS */ @@ -134,12 +131,11 @@ EXPORT_SYMBOL(local_bh_disable);  static void __local_bh_enable(unsigned int cnt)  { -	WARN_ON_ONCE(in_irq());  	WARN_ON_ONCE(!irqs_disabled());  	if (softirq_count() == cnt)  		trace_softirqs_on(_RET_IP_); -	sub_preempt_count(cnt); +	preempt_count_sub(cnt);  }  /* @@ -149,6 +145,7 @@ static void __local_bh_enable(unsigned int cnt)   */  void _local_bh_enable(void)  { +	WARN_ON_ONCE(in_irq());  	__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);  } @@ -169,12 +166,17 @@ static inline void _local_bh_enable_ip(unsigned long ip)  	 * Keep preemption disabled until we are done with  	 * softirq processing:   	 */ -	sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); +	preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1); -	if (unlikely(!in_interrupt() && local_softirq_pending())) +	if (unlikely(!in_interrupt() && local_softirq_pending())) { +		/* +		 * Run softirq if any pending. And do it in its own stack +		 * as we may be calling this deep in a task call stack already. +		 */  		do_softirq(); +	} -	dec_preempt_count(); +	preempt_count_dec();  #ifdef CONFIG_TRACE_IRQFLAGS  	local_irq_enable();  #endif @@ -256,7 +258,7 @@ restart:  				       " exited with %08x?\n", vec_nr,  				       softirq_to_name[vec_nr], h->action,  				       prev_count, preempt_count()); -				preempt_count() = prev_count; +				preempt_count_set(prev_count);  			}  			rcu_bh_qs(cpu); @@ -280,10 +282,11 @@ restart:  	account_irq_exit_time(current);  	__local_bh_enable(SOFTIRQ_OFFSET); +	WARN_ON_ONCE(in_interrupt());  	tsk_restore_flags(current, old_flags, PF_MEMALLOC);  } -#ifndef __ARCH_HAS_DO_SOFTIRQ +  asmlinkage void do_softirq(void)  { @@ -298,13 +301,11 @@ asmlinkage void do_softirq(void)  	pending = local_softirq_pending();  	if (pending) -		__do_softirq(); +		do_softirq_own_stack();  	local_irq_restore(flags);  } -#endif -  /*   * Enter an interrupt context.   */ @@ -329,15 +330,21 @@ void irq_enter(void)  static inline void invoke_softirq(void)  {  	if (!force_irqthreads) { +#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK  		/*  		 * We can safely execute softirq on the current stack if  		 * it is the irq stack, because it should be near empty -		 * at this stage. But we have no way to know if the arch -		 * calls irq_exit() on the irq stack. So call softirq -		 * in its own stack to prevent from any overrun on top -		 * of a potentially deep task stack. +		 * at this stage.  		 */ -		do_softirq(); +		__do_softirq(); +#else +		/* +		 * Otherwise, irq_exit() is called on the task stack that can +		 * be potentially deep already. So call softirq in its own stack +		 * to prevent from any overrun. 
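
These conversions replace open-coded arithmetic on preempt_count() with the preempt_count_add()/preempt_count_sub()/preempt_count_set() accessors. The underlying idea is that a single per-CPU integer encodes several nesting levels in separate bit fields, so adding an offset enters a context (softirq, hardirq) and subtracting it leaves. A toy standalone illustration of that encoding follows; the field widths are invented for the example and are not the kernel's actual layout.

#include <stdio.h>

#define PREEMPT_SHIFT	0
#define SOFTIRQ_SHIFT	8
#define HARDIRQ_SHIFT	16

#define SOFTIRQ_OFFSET	(1U << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET	(1U << HARDIRQ_SHIFT)

#define SOFTIRQ_MASK	(0xffU << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK	(0xfU  << HARDIRQ_SHIFT)

static unsigned int count;	/* stand-in for the per-CPU preempt count */

static void count_add(unsigned int val) { count += val; }
static void count_sub(unsigned int val) { count -= val; }

int main(void)
{
	count_add(SOFTIRQ_OFFSET);	/* enter softirq context */
	count_add(HARDIRQ_OFFSET);	/* an interrupt nests on top */
	printf("interrupt bits: %#x\n", count & (SOFTIRQ_MASK | HARDIRQ_MASK));
	count_sub(HARDIRQ_OFFSET);	/* irq_exit() */
	count_sub(SOFTIRQ_OFFSET);	/* softirq done */
	printf("back to task context: %#x\n", count);
	return 0;
}
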
+		 */ +		do_softirq_own_stack(); +#endif  	} else {  		wakeup_softirqd();  	} @@ -369,7 +376,7 @@ void irq_exit(void)  	account_irq_exit_time(current);  	trace_hardirq_exit(); -	sub_preempt_count(HARDIRQ_OFFSET); +	preempt_count_sub(HARDIRQ_OFFSET);  	if (!in_interrupt() && local_softirq_pending())  		invoke_softirq(); @@ -618,146 +625,17 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,  }  EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); -/* - * Remote softirq bits - */ - -DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); -EXPORT_PER_CPU_SYMBOL(softirq_work_list); - -static void __local_trigger(struct call_single_data *cp, int softirq) -{ -	struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); - -	list_add_tail(&cp->list, head); - -	/* Trigger the softirq only if the list was previously empty.  */ -	if (head->next == &cp->list) -		raise_softirq_irqoff(softirq); -} - -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static void remote_softirq_receive(void *data) -{ -	struct call_single_data *cp = data; -	unsigned long flags; -	int softirq; - -	softirq = *(int *)cp->info; -	local_irq_save(flags); -	__local_trigger(cp, softirq); -	local_irq_restore(flags); -} - -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ -	if (cpu_online(cpu)) { -		cp->func = remote_softirq_receive; -		cp->info = &softirq; -		cp->flags = 0; - -		__smp_call_function_single(cpu, cp, 0); -		return 0; -	} -	return 1; -} -#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ -	return 1; -} -#endif - -/** - * __send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @this_cpu: the currently executing cpu - * @softirq: the softirq for the work - * - * Attempt to schedule softirq work on a remote cpu.  If this cannot be - * done, the work is instead queued up on the local cpu. - * - * Interrupts must be disabled. - */ -void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) -{ -	if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) -		__local_trigger(cp, softirq); -} -EXPORT_SYMBOL(__send_remote_softirq); - -/** - * send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @softirq: the softirq for the work - * - * Like __send_remote_softirq except that disabling interrupts and - * computing the current cpu is done for the caller. 
- */ -void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ -	unsigned long flags; -	int this_cpu; - -	local_irq_save(flags); -	this_cpu = smp_processor_id(); -	__send_remote_softirq(cp, cpu, this_cpu, softirq); -	local_irq_restore(flags); -} -EXPORT_SYMBOL(send_remote_softirq); - -static int remote_softirq_cpu_notify(struct notifier_block *self, -					       unsigned long action, void *hcpu) -{ -	/* -	 * If a CPU goes away, splice its entries to the current CPU -	 * and trigger a run of the softirq -	 */ -	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { -		int cpu = (unsigned long) hcpu; -		int i; - -		local_irq_disable(); -		for (i = 0; i < NR_SOFTIRQS; i++) { -			struct list_head *head = &per_cpu(softirq_work_list[i], cpu); -			struct list_head *local_head; - -			if (list_empty(head)) -				continue; - -			local_head = &__get_cpu_var(softirq_work_list[i]); -			list_splice_init(head, local_head); -			raise_softirq_irqoff(i); -		} -		local_irq_enable(); -	} - -	return NOTIFY_OK; -} - -static struct notifier_block remote_softirq_cpu_notifier = { -	.notifier_call	= remote_softirq_cpu_notify, -}; -  void __init softirq_init(void)  {  	int cpu;  	for_each_possible_cpu(cpu) { -		int i; -  		per_cpu(tasklet_vec, cpu).tail =  			&per_cpu(tasklet_vec, cpu).head;  		per_cpu(tasklet_hi_vec, cpu).tail =  			&per_cpu(tasklet_hi_vec, cpu).head; -		for (i = 0; i < NR_SOFTIRQS; i++) -			INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));  	} -	register_hotcpu_notifier(&remote_softirq_cpu_notifier); -  	open_softirq(TASKLET_SOFTIRQ, tasklet_action);  	open_softirq(HI_SOFTIRQ, tasklet_hi_action);  } @@ -771,6 +649,10 @@ static void run_ksoftirqd(unsigned int cpu)  {  	local_irq_disable();  	if (local_softirq_pending()) { +		/* +		 * We can safely run softirq on inline stack, as we are not deep +		 * in the task stack here. +		 */  		__do_softirq();  		rcu_note_context_switch(cpu);  		local_irq_enable(); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index c09f2955ae30..84571e09c907 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -20,6 +20,7 @@  #include <linux/kallsyms.h>  #include <linux/smpboot.h>  #include <linux/atomic.h> +#include <linux/lglock.h>  /*   * Structure to determine completion condition and record errors.  May @@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);  static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);  static bool stop_machine_initialized = false; +/* + * Avoids a race between stop_two_cpus and global stop_cpus, where + * the stoppers could get queued up in reverse order, leading to + * system deadlock. Using an lglock means stop_two_cpus remains + * relatively cheap. + */ +DEFINE_STATIC_LGLOCK(stop_cpus_lock); +  static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)  {  	memset(done, 0, sizeof(*done)); @@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)  	return done.executed ? done.ret : -ENOENT;  } +/* This controls the threads on each CPU. */ +enum multi_stop_state { +	/* Dummy starting state for thread. */ +	MULTI_STOP_NONE, +	/* Awaiting everyone to be scheduled. */ +	MULTI_STOP_PREPARE, +	/* Disable interrupts. */ +	MULTI_STOP_DISABLE_IRQ, +	/* Run the function */ +	MULTI_STOP_RUN, +	/* Exit */ +	MULTI_STOP_EXIT, +}; + +struct multi_stop_data { +	int			(*fn)(void *); +	void			*data; +	/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. 
*/ +	unsigned int		num_threads; +	const struct cpumask	*active_cpus; + +	enum multi_stop_state	state; +	atomic_t		thread_ack; +}; + +static void set_state(struct multi_stop_data *msdata, +		      enum multi_stop_state newstate) +{ +	/* Reset ack counter. */ +	atomic_set(&msdata->thread_ack, msdata->num_threads); +	smp_wmb(); +	msdata->state = newstate; +} + +/* Last one to ack a state moves to the next state. */ +static void ack_state(struct multi_stop_data *msdata) +{ +	if (atomic_dec_and_test(&msdata->thread_ack)) +		set_state(msdata, msdata->state + 1); +} + +/* This is the cpu_stop function which stops the CPU. */ +static int multi_cpu_stop(void *data) +{ +	struct multi_stop_data *msdata = data; +	enum multi_stop_state curstate = MULTI_STOP_NONE; +	int cpu = smp_processor_id(), err = 0; +	unsigned long flags; +	bool is_active; + +	/* +	 * When called from stop_machine_from_inactive_cpu(), irq might +	 * already be disabled.  Save the state and restore it on exit. +	 */ +	local_save_flags(flags); + +	if (!msdata->active_cpus) +		is_active = cpu == cpumask_first(cpu_online_mask); +	else +		is_active = cpumask_test_cpu(cpu, msdata->active_cpus); + +	/* Simple state machine */ +	do { +		/* Chill out and ensure we re-read multi_stop_state. */ +		cpu_relax(); +		if (msdata->state != curstate) { +			curstate = msdata->state; +			switch (curstate) { +			case MULTI_STOP_DISABLE_IRQ: +				local_irq_disable(); +				hard_irq_disable(); +				break; +			case MULTI_STOP_RUN: +				if (is_active) +					err = msdata->fn(msdata->data); +				break; +			default: +				break; +			} +			ack_state(msdata); +		} +	} while (curstate != MULTI_STOP_EXIT); + +	local_irq_restore(flags); +	return err; +} + +struct irq_cpu_stop_queue_work_info { +	int cpu1; +	int cpu2; +	struct cpu_stop_work *work1; +	struct cpu_stop_work *work2; +}; + +/* + * This function is always run with irqs and preemption disabled. + * This guarantees that both work1 and work2 get queued, before + * our local migrate thread gets the chance to preempt us. + */ +static void irq_cpu_stop_queue_work(void *arg) +{ +	struct irq_cpu_stop_queue_work_info *info = arg; +	cpu_stop_queue_work(info->cpu1, info->work1); +	cpu_stop_queue_work(info->cpu2, info->work2); +} + +/** + * stop_two_cpus - stops two cpus + * @cpu1: the cpu to stop + * @cpu2: the other cpu to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Stops both the current and specified CPU and runs @fn on one of them. + * + * returns when both are completed. + */ +int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) +{ +	struct cpu_stop_done done; +	struct cpu_stop_work work1, work2; +	struct irq_cpu_stop_queue_work_info call_args; +	struct multi_stop_data msdata; + +	preempt_disable(); +	msdata = (struct multi_stop_data){ +		.fn = fn, +		.data = arg, +		.num_threads = 2, +		.active_cpus = cpumask_of(cpu1), +	}; + +	work1 = work2 = (struct cpu_stop_work){ +		.fn = multi_cpu_stop, +		.arg = &msdata, +		.done = &done +	}; + +	call_args = (struct irq_cpu_stop_queue_work_info){ +		.cpu1 = cpu1, +		.cpu2 = cpu2, +		.work1 = &work1, +		.work2 = &work2, +	}; + +	cpu_stop_init_done(&done, 2); +	set_state(&msdata, MULTI_STOP_PREPARE); + +	/* +	 * If we observe both CPUs active we know _cpu_down() cannot yet have +	 * queued its stop_machine works and therefore ours will get executed +	 * first. Or its not either one of our CPUs that's getting unplugged, +	 * in which case we don't care. +	 * +	 * This relies on the stopper workqueues to be FIFO. 
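
multi_cpu_stop() is a lock-step state machine: every participating CPU spins, and the last one to acknowledge a state advances the whole group to the next state via the atomic thread_ack countdown. A compact userspace model of that barrier-per-state idea (pthreads and C11 atomics, names invented; every thread runs the payload here, whereas the kernel gates it on active_cpus):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

enum step { STEP_NONE, STEP_PREPARE, STEP_RUN, STEP_EXIT };

#define NTHREADS 4

static _Atomic int state = STEP_NONE;
static _Atomic int thread_ack;

static void set_state(int newstate)
{
	atomic_store(&thread_ack, NTHREADS);	/* reset the ack counter */
	atomic_store(&state, newstate);
}

static void ack_state(void)
{
	/* the last thread to acknowledge moves everyone to the next state */
	if (atomic_fetch_sub(&thread_ack, 1) == 1)
		set_state(atomic_load(&state) + 1);
}

static void *stepper(void *arg)
{
	int curstate = STEP_NONE;
	long id = (long)arg;

	do {
		int s = atomic_load(&state);	/* spin, re-reading the state */
		if (s != curstate) {
			curstate = s;
			if (curstate == STEP_RUN)
				printf("thread %ld runs the payload step\n", id);
			ack_state();
		}
	} while (curstate != STEP_EXIT);
	return NULL;
}

int main(void)
{
	pthread_t t[NTHREADS];

	for (long i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, stepper, (void *)i);
	set_state(STEP_PREPARE);	/* kick off the lock-step sequence */
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);
	return 0;
}
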
+	 */ +	if (!cpu_active(cpu1) || !cpu_active(cpu2)) { +		preempt_enable(); +		return -ENOENT; +	} + +	lg_local_lock(&stop_cpus_lock); +	/* +	 * Queuing needs to be done by the lowest numbered CPU, to ensure +	 * that works are always queued in the same order on every CPU. +	 * This prevents deadlocks. +	 */ +	smp_call_function_single(min(cpu1, cpu2), +				 &irq_cpu_stop_queue_work, +				 &call_args, 0); +	lg_local_unlock(&stop_cpus_lock); +	preempt_enable(); + +	wait_for_completion(&done.completion); + +	return done.executed ? done.ret : -ENOENT; +} +  /**   * stop_one_cpu_nowait - stop a cpu but don't wait for completion   * @cpu: cpu to stop @@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,  	 * preempted by a stopper which might wait for other stoppers  	 * to enter @fn which can lead to deadlock.  	 */ -	preempt_disable(); +	lg_global_lock(&stop_cpus_lock);  	for_each_cpu(cpu, cpumask)  		cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); -	preempt_enable(); +	lg_global_unlock(&stop_cpus_lock);  }  static int __stop_cpus(const struct cpumask *cpumask, @@ -359,98 +546,14 @@ early_initcall(cpu_stop_init);  #ifdef CONFIG_STOP_MACHINE -/* This controls the threads on each CPU. */ -enum stopmachine_state { -	/* Dummy starting state for thread. */ -	STOPMACHINE_NONE, -	/* Awaiting everyone to be scheduled. */ -	STOPMACHINE_PREPARE, -	/* Disable interrupts. */ -	STOPMACHINE_DISABLE_IRQ, -	/* Run the function */ -	STOPMACHINE_RUN, -	/* Exit */ -	STOPMACHINE_EXIT, -}; - -struct stop_machine_data { -	int			(*fn)(void *); -	void			*data; -	/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ -	unsigned int		num_threads; -	const struct cpumask	*active_cpus; - -	enum stopmachine_state	state; -	atomic_t		thread_ack; -}; - -static void set_state(struct stop_machine_data *smdata, -		      enum stopmachine_state newstate) -{ -	/* Reset ack counter. */ -	atomic_set(&smdata->thread_ack, smdata->num_threads); -	smp_wmb(); -	smdata->state = newstate; -} - -/* Last one to ack a state moves to the next state. */ -static void ack_state(struct stop_machine_data *smdata) -{ -	if (atomic_dec_and_test(&smdata->thread_ack)) -		set_state(smdata, smdata->state + 1); -} - -/* This is the cpu_stop function which stops the CPU. */ -static int stop_machine_cpu_stop(void *data) -{ -	struct stop_machine_data *smdata = data; -	enum stopmachine_state curstate = STOPMACHINE_NONE; -	int cpu = smp_processor_id(), err = 0; -	unsigned long flags; -	bool is_active; - -	/* -	 * When called from stop_machine_from_inactive_cpu(), irq might -	 * already be disabled.  Save the state and restore it on exit. -	 */ -	local_save_flags(flags); - -	if (!smdata->active_cpus) -		is_active = cpu == cpumask_first(cpu_online_mask); -	else -		is_active = cpumask_test_cpu(cpu, smdata->active_cpus); - -	/* Simple state machine */ -	do { -		/* Chill out and ensure we re-read stopmachine_state. 
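
Both the lglock and the "queue from the lowest numbered CPU" rule serve the same purpose: every path that queues stopper work does so in one globally consistent order, so two concurrent stop requests can never queue their works in opposite orders on two CPUs and deadlock against each other. The classic userspace version of that rule is acquiring a pair of locks in a fixed order; a small sketch:

#include <pthread.h>
#include <stdint.h>

/* Deadlock avoidance by global ordering: whichever pair is requested,
 * always take the lower-addressed lock first, mirroring the "queue on the
 * lowest numbered CPU first" rule above. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_t *first  = ((uintptr_t)a < (uintptr_t)b) ? a : b;
	pthread_mutex_t *second = ((uintptr_t)a < (uintptr_t)b) ? b : a;

	pthread_mutex_lock(first);
	pthread_mutex_lock(second);
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	pthread_mutex_unlock(b);
}
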
*/ -		cpu_relax(); -		if (smdata->state != curstate) { -			curstate = smdata->state; -			switch (curstate) { -			case STOPMACHINE_DISABLE_IRQ: -				local_irq_disable(); -				hard_irq_disable(); -				break; -			case STOPMACHINE_RUN: -				if (is_active) -					err = smdata->fn(smdata->data); -				break; -			default: -				break; -			} -			ack_state(smdata); -		} -	} while (curstate != STOPMACHINE_EXIT); - -	local_irq_restore(flags); -	return err; -} -  int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)  { -	struct stop_machine_data smdata = { .fn = fn, .data = data, -					    .num_threads = num_online_cpus(), -					    .active_cpus = cpus }; +	struct multi_stop_data msdata = { +		.fn = fn, +		.data = data, +		.num_threads = num_online_cpus(), +		.active_cpus = cpus, +	};  	if (!stop_machine_initialized) {  		/* @@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)  		unsigned long flags;  		int ret; -		WARN_ON_ONCE(smdata.num_threads != 1); +		WARN_ON_ONCE(msdata.num_threads != 1);  		local_irq_save(flags);  		hard_irq_disable(); @@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)  	}  	/* Set the initial state and stop all online cpus. */ -	set_state(&smdata, STOPMACHINE_PREPARE); -	return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); +	set_state(&msdata, MULTI_STOP_PREPARE); +	return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);  }  int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) @@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine);  int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,  				  const struct cpumask *cpus)  { -	struct stop_machine_data smdata = { .fn = fn, .data = data, +	struct multi_stop_data msdata = { .fn = fn, .data = data,  					    .active_cpus = cpus };  	struct cpu_stop_done done;  	int ret;  	/* Local CPU must be inactive and CPU hotplug in progress. */  	BUG_ON(cpu_active(raw_smp_processor_id())); -	smdata.num_threads = num_active_cpus() + 1;	/* +1 for local */ +	msdata.num_threads = num_active_cpus() + 1;	/* +1 for local */  	/* No proper task established and can't sleep - busy wait for lock. */  	while (!mutex_trylock(&stop_cpus_mutex))  		cpu_relax();  	/* Schedule work on other CPUs and execute directly for local CPU */ -	set_state(&smdata, STOPMACHINE_PREPARE); +	set_state(&msdata, MULTI_STOP_PREPARE);  	cpu_stop_init_done(&done, num_active_cpus()); -	queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, +	queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,  			     &done); -	ret = stop_machine_cpu_stop(&smdata); +	ret = multi_cpu_stop(&msdata);  	/* Busy wait for completion. 
*/  	while (!completion_done(&done.completion)) diff --git a/kernel/sys.c b/kernel/sys.c index c18ecca575b4..c72311324ea7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -16,7 +16,6 @@  #include <linux/perf_event.h>  #include <linux/resource.h>  #include <linux/kernel.h> -#include <linux/kexec.h>  #include <linux/workqueue.h>  #include <linux/capability.h>  #include <linux/device.h> diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b2f06f3c6a3f..34a604726d0b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,  #ifdef CONFIG_MAGIC_SYSRQ  /* Note: sysrq code uses it's own private copy */ -static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; +static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;  static int sysrq_sysctl_handler(ctl_table *table, int write,  				void __user *buffer, size_t *lenp, @@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = {  		.proc_handler	= proc_dointvec,  	},  	{ -		.procname	= "numa_balancing_scan_period_reset", -		.data		= &sysctl_numa_balancing_scan_period_reset, -		.maxlen		= sizeof(unsigned int), -		.mode		= 0644, -		.proc_handler	= proc_dointvec, -	}, -	{  		.procname	= "numa_balancing_scan_period_max_ms",  		.data		= &sysctl_numa_balancing_scan_period_max,  		.maxlen		= sizeof(unsigned int), @@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  	}, +	{ +		.procname       = "numa_balancing_settle_count", +		.data           = &sysctl_numa_balancing_settle_count, +		.maxlen         = sizeof(unsigned int), +		.mode           = 0644, +		.proc_handler   = proc_dointvec, +	}, +	{ +		.procname       = "numa_balancing_migrate_deferred", +		.data           = &sysctl_numa_balancing_migrate_deferred, +		.maxlen         = sizeof(unsigned int), +		.mode           = 0644, +		.proc_handler   = proc_dointvec, +	},  #endif /* CONFIG_NUMA_BALANCING */  #endif /* CONFIG_SCHED_DEBUG */  	{ @@ -962,9 +969,10 @@ static struct ctl_table kern_table[] = {  	{  		.procname	= "hung_task_check_count",  		.data		= &sysctl_hung_task_check_count, -		.maxlen		= sizeof(unsigned long), +		.maxlen		= sizeof(int),  		.mode		= 0644, -		.proc_handler	= proc_doulongvec_minmax, +		.proc_handler	= proc_dointvec_minmax, +		.extra1		= &zero,  	},  	{  		.procname	= "hung_task_timeout_secs", @@ -1049,6 +1057,7 @@ static struct ctl_table kern_table[] = {  		.maxlen		= sizeof(sysctl_perf_event_sample_rate),  		.mode		= 0644,  		.proc_handler	= perf_proc_update_handler, +		.extra1		= &one,  	},  	{  		.procname	= "perf_cpu_time_max_percent", @@ -2214,8 +2223,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int  			*i = val;  		} else {  			val = convdiv * (*i) / convmul; -			if (!first) +			if (!first) {  				err = proc_put_char(&buffer, &left, '\t'); +				if (err) +					break; +			}  			err = proc_put_long(&buffer, &left, val, false);  			if (err)  				break; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b609213ca9a2..653cbbd9e7ad 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1024,7 +1024,7 @@ static ssize_t bin_intvec(struct file *file,  			if (get_user(value, vec + i))  				goto out_kfree; -			str += snprintf(str, end - str, "%lu\t", value); +			str += scnprintf(str, end - str, "%lu\t", value);  		}  		result = kernel_write(file, buffer, str - buffer, 0); @@ -1095,7 +1095,7 @@ static ssize_t bin_ulongvec(struct file *file,  			if (get_user(value, vec + i))  				
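
The snprintf() to scnprintf() conversions here matter because snprintf() returns the length the output would have had, which can exceed the remaining buffer space; a loop doing str += snprintf(str, end - str, ...) can then advance str past end. scnprintf() returns the number of characters actually written. A userspace demonstration of the difference, with a small scnprintf-style wrapper since libc only provides snprintf (my_scnprintf is an invented name):

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

/* Return what was actually written, never more than size - 1, instead of
 * the would-be length that snprintf() reports. */
static int my_scnprintf(char *buf, size_t size, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = vsnprintf(buf, size, fmt, args);
	va_end(args);

	if (ret < 0 || size == 0)
		return 0;
	return (size_t)ret >= size ? (int)(size - 1) : ret;
}

int main(void)
{
	char buf[8];

	int a = snprintf(buf, sizeof(buf), "%s", "0123456789");
	int b = my_scnprintf(buf, sizeof(buf), "%s", "0123456789");

	/* a == 10 (more than fits in buf), b == 7 (what was written) */
	printf("snprintf=%d scnprintf=%d strlen=%zu\n", a, b, strlen(buf));
	return 0;
}
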
goto out_kfree; -			str += snprintf(str, end - str, "%lu\t", value); +			str += scnprintf(str, end - str, "%lu\t", value);  		}  		result = kernel_write(file, buffer, str - buffer, 0); @@ -1205,7 +1205,7 @@ static ssize_t bin_dn_node_address(struct file *file,  		if (get_user(dnaddr, (__le16 __user *)newval))  			goto out; -		len = snprintf(buf, sizeof(buf), "%hu.%hu", +		len = scnprintf(buf, sizeof(buf), "%hu.%hu",  				le16_to_cpu(dnaddr) >> 10,  				le16_to_cpu(dnaddr) & 0x3ff); diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S new file mode 100644 index 000000000000..4aef390671cb --- /dev/null +++ b/kernel/system_certificates.S @@ -0,0 +1,10 @@ +#include <linux/export.h> +#include <linux/init.h> + +	__INITRODATA + +	.globl VMLINUX_SYMBOL(system_certificate_list) +VMLINUX_SYMBOL(system_certificate_list): +	.incbin "kernel/x509_certificate_list" +	.globl VMLINUX_SYMBOL(system_certificate_list_end) +VMLINUX_SYMBOL(system_certificate_list_end): diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c new file mode 100644 index 000000000000..564dd93430a2 --- /dev/null +++ b/kernel/system_keyring.c @@ -0,0 +1,105 @@ +/* System trusted keyring for trusted public keys + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <keys/asymmetric-type.h> +#include <keys/system_keyring.h> +#include "module-internal.h" + +struct key *system_trusted_keyring; +EXPORT_SYMBOL_GPL(system_trusted_keyring); + +extern __initconst const u8 system_certificate_list[]; +extern __initconst const u8 system_certificate_list_end[]; + +/* + * Load the compiled-in keys + */ +static __init int system_trusted_keyring_init(void) +{ +	pr_notice("Initialise system trusted keyring\n"); + +	system_trusted_keyring = +		keyring_alloc(".system_keyring", +			      KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), +			      ((KEY_POS_ALL & ~KEY_POS_SETATTR) | +			      KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), +			      KEY_ALLOC_NOT_IN_QUOTA, NULL); +	if (IS_ERR(system_trusted_keyring)) +		panic("Can't allocate system trusted keyring\n"); + +	set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); +	return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(system_trusted_keyring_init); + +/* + * Load the compiled-in list of X.509 certificates. + */ +static __init int load_system_certificate_list(void) +{ +	key_ref_t key; +	const u8 *p, *end; +	size_t plen; + +	pr_notice("Loading compiled-in X.509 certificates\n"); + +	end = system_certificate_list_end; +	p = system_certificate_list; +	while (p < end) { +		/* Each cert begins with an ASN.1 SEQUENCE tag and must be more +		 * than 256 bytes in size. 
+		 */ +		if (end - p < 4) +			goto dodgy_cert; +		if (p[0] != 0x30 && +		    p[1] != 0x82) +			goto dodgy_cert; +		plen = (p[2] << 8) | p[3]; +		plen += 4; +		if (plen > end - p) +			goto dodgy_cert; + +		key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), +					   "asymmetric", +					   NULL, +					   p, +					   plen, +					   ((KEY_POS_ALL & ~KEY_POS_SETATTR) | +					   KEY_USR_VIEW | KEY_USR_READ), +					   KEY_ALLOC_NOT_IN_QUOTA | +					   KEY_ALLOC_TRUSTED); +		if (IS_ERR(key)) { +			pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", +			       PTR_ERR(key)); +		} else { +			pr_notice("Loaded X.509 cert '%s'\n", +				  key_ref_to_ptr(key)->description); +			key_ref_put(key); +		} +		p += plen; +	} + +	return 0; + +dodgy_cert: +	pr_err("Problem parsing in-kernel X.509 certificate list\n"); +	return 0; +} +late_initcall(load_system_certificate_list); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 145bb4d3bd4d..13d2f7cd65db 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -290,6 +290,7 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)  	struct listener_list *listeners;  	struct listener *s, *tmp, *s2;  	unsigned int cpu; +	int ret = 0;  	if (!cpumask_subset(mask, cpu_possible_mask))  		return -EINVAL; @@ -304,9 +305,10 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)  		for_each_cpu(cpu, mask) {  			s = kmalloc_node(sizeof(struct listener),  					GFP_KERNEL, cpu_to_node(cpu)); -			if (!s) +			if (!s) { +				ret = -ENOMEM;  				goto cleanup; - +			}  			s->pid = pid;  			s->valid = 1; @@ -339,7 +341,7 @@ cleanup:  		}  		up_write(&listeners->sem);  	} -	return 0; +	return ret;  }  static int parse(struct nlattr *na, struct cpumask *mask) @@ -404,11 +406,15 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)  	if (!na)  		goto err; -	if (nla_put(skb, type, sizeof(pid), &pid) < 0) +	if (nla_put(skb, type, sizeof(pid), &pid) < 0) { +		nla_nest_cancel(skb, na);  		goto err; +	}  	ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); -	if (!ret) +	if (!ret) { +		nla_nest_cancel(skb, na);  		goto err; +	}  	nla_nest_end(skb, na);  	return nla_data(ret); @@ -667,17 +673,18 @@ err:  	nlmsg_free(rep_skb);  } -static struct genl_ops taskstats_ops = { -	.cmd		= TASKSTATS_CMD_GET, -	.doit		= taskstats_user_cmd, -	.policy		= taskstats_cmd_get_policy, -	.flags		= GENL_ADMIN_PERM, -}; - -static struct genl_ops cgroupstats_ops = { -	.cmd		= CGROUPSTATS_CMD_GET, -	.doit		= cgroupstats_user_cmd, -	.policy		= cgroupstats_cmd_get_policy, +static const struct genl_ops taskstats_ops[] = { +	{ +		.cmd		= TASKSTATS_CMD_GET, +		.doit		= taskstats_user_cmd, +		.policy		= taskstats_cmd_get_policy, +		.flags		= GENL_ADMIN_PERM, +	}, +	{ +		.cmd		= CGROUPSTATS_CMD_GET, +		.doit		= cgroupstats_user_cmd, +		.policy		= cgroupstats_cmd_get_policy, +	},  };  /* Needed early in initialization */ @@ -696,26 +703,13 @@ static int __init taskstats_init(void)  {  	int rc; -	rc = genl_register_family(&family); +	rc = genl_register_family_with_ops(&family, taskstats_ops);  	if (rc)  		return rc; -	rc = genl_register_ops(&family, &taskstats_ops); -	if (rc < 0) -		goto err; - -	rc = genl_register_ops(&family, &cgroupstats_ops); -	if (rc < 0) -		goto err_cgroup_ops; -  	family_registered = 1;  	pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);  	return 0; -err_cgroup_ops: -	genl_unregister_ops(&family, &taskstats_ops); -err: -	
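
The loader above walks a concatenated blob of DER-encoded certificates by peeking at each ASN.1 header: tag 0x30 (SEQUENCE) followed by 0x82 indicates a two-byte big-endian length, so the object occupies that length plus the four header bytes. A standalone helper doing the same header decode (illustrative only, and written to require both header bytes to match) might be:

#include <stddef.h>
#include <stdint.h>

/* Decode one DER SEQUENCE header of the form 30 82 <hi> <lo> and return the
 * total object size including the 4-byte header, or 0 if the data does not
 * look like such a certificate or would run past the end of the blob. */
static size_t der_cert_len(const uint8_t *p, size_t remaining)
{
	size_t plen;

	if (remaining < 4)
		return 0;
	if (p[0] != 0x30 || p[1] != 0x82)	/* SEQUENCE, long-form length */
		return 0;

	plen = ((size_t)p[2] << 8) | p[3];
	plen += 4;				/* account for the header itself */
	if (plen > remaining)
		return 0;
	return plen;
}

Walking the blob could then be written as: for (p = list; (n = der_cert_len(p, end - p)) != 0; p += n) load_one_cert(p, n); which mirrors the loop above.
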
genl_unregister_family(&family); -	return rc;  }  /* diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 2b62fe86f9ec..3ce6e8c5f3fc 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -100,7 +100,7 @@ config NO_HZ_FULL  	# RCU_USER_QS dependency  	depends on HAVE_CONTEXT_TRACKING  	# VIRT_CPU_ACCOUNTING_GEN dependency -	depends on 64BIT +	depends on HAVE_VIRT_CPU_ACCOUNTING_GEN  	select NO_HZ_COMMON  	select RCU_USER_QS  	select RCU_NOCB_CPU diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index eec50fcef9e4..88c9c65a430d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)  	clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;  	if (!alarmtimer_get_rtcdev()) -		return -ENOTSUPP; +		return -EINVAL;  	return hrtimer_get_res(baseid, tp);  } @@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)  	struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];  	if (!alarmtimer_get_rtcdev()) -		return -ENOTSUPP; +		return -EINVAL;  	*tp = ktime_to_timespec(base->gettime());  	return 0; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 38959c866789..086ad6043bcb 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -33,29 +33,64 @@ struct ce_unbind {  	int res;  }; -/** - * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds - * @latch:	value to convert - * @evt:	pointer to clock event device descriptor - * - * Math helper, returns latch value converted to nanoseconds (bound checked) - */ -u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, +			bool ismax)  {  	u64 clc = (u64) latch << evt->shift; +	u64 rnd;  	if (unlikely(!evt->mult)) {  		evt->mult = 1;  		WARN_ON(1);  	} +	rnd = (u64) evt->mult - 1; + +	/* +	 * Upper bound sanity check. If the backwards conversion is +	 * not equal latch, we know that the above shift overflowed. +	 */ +	if ((clc >> evt->shift) != (u64)latch) +		clc = ~0ULL; + +	/* +	 * Scaled math oddities: +	 * +	 * For mult <= (1 << shift) we can safely add mult - 1 to +	 * prevent integer rounding loss. So the backwards conversion +	 * from nsec to device ticks will be correct. +	 * +	 * For mult > (1 << shift), i.e. device frequency is > 1GHz we +	 * need to be careful. Adding mult - 1 will result in a value +	 * which when converted back to device ticks can be larger +	 * than latch by up to (mult - 1) >> shift. For the min_delta +	 * calculation we still want to apply this in order to stay +	 * above the minimum device ticks limit. For the upper limit +	 * we would end up with a latch value larger than the upper +	 * limit of the device, so we omit the add to stay below the +	 * device upper boundary. +	 * +	 * Also omit the add if it would overflow the u64 boundary. +	 */ +	if ((~0ULL - clc > rnd) && +	    (!ismax || evt->mult <= (1U << evt->shift))) +		clc += rnd;  	do_div(clc, evt->mult); -	if (clc < 1000) -		clc = 1000; -	if (clc > KTIME_MAX) -		clc = KTIME_MAX; -	return clc; +	/* Deltas less than 1usec are pointless noise */ +	return clc > 1000 ? 
clc : 1000; +} + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch:	value to convert + * @evt:	pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +{ +	return cev_delta2ns(latch, evt, false);  }  EXPORT_SYMBOL_GPL(clockevent_delta2ns); @@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq)  		sec = 600;  	clockevents_calc_mult_shift(dev, freq, sec); -	dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); -	dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); +	dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); +	dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true);  }  /** @@ -584,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,  				     const char *buf, size_t count)  {  	char name[CS_NAME_LEN]; -	size_t ret = sysfs_get_uname(buf, name, count); +	ssize_t ret = sysfs_get_uname(buf, name, count);  	struct clock_event_device *ce;  	if (ret < 0) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 50a8736757f3..ba3e502c955a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }  static inline void clocksource_resume_watchdog(void) { }  static inline int __clocksource_watchdog_kthread(void) { return 0; }  static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } +void clocksource_mark_unstable(struct clocksource *cs) { }  #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)  }  /** - * clocksource_max_deferment - Returns max time the clocksource can be deferred - * @cs:         Pointer to clocksource - * + * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted + * @mult:	cycle to nanosecond multiplier + * @shift:	cycle to nanosecond divisor (power of two) + * @maxadj:	maximum adjustment value to mult (~11%) + * @mask:	bitmask for two's complement subtraction of non 64 bit counters   */ -static u64 clocksource_max_deferment(struct clocksource *cs) +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)  {  	u64 max_nsecs, max_cycles;  	/*  	 * Calculate the maximum number of cycles that we can pass to the  	 * cyc2ns function without overflowing a 64-bit signed result. The -	 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) +	 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)  	 * which is equivalent to the below. -	 * max_cycles < (2^63)/(cs->mult + cs->maxadj) -	 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) -	 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) -	 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) -	 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) +	 * max_cycles < (2^63)/(mult + maxadj) +	 * max_cycles < 2^(log2((2^63)/(mult + maxadj))) +	 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) +	 * max_cycles < 2^(63 - log2(mult + maxadj)) +	 * max_cycles < 1 << (63 - log2(mult + maxadj))  	 * Please note that we add 1 to the result of the log2 to account for  	 * any rounding errors, ensure the above inequality is satisfied and  	 * no overflow will occur.  	 
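
clocks_calc_max_nsecs() bounds how far a clock event or clocksource can be deferred: cycles convert to nanoseconds as ns = (cyc * mult) >> shift, so the largest safe cycle count is roughly 2^63 / (mult + maxadj), further clamped by the counter mask. A small standalone version of that arithmetic (userspace, 64-bit math only; the sample mult/shift values are invented, not taken from real hardware):

#include <stdint.h>
#include <stdio.h>

static unsigned int ilog2_u32(uint32_t v)
{
	unsigned int l = 0;

	while (v >>= 1)
		l++;
	return l;
}

/* ns = (cycles * mult) >> shift; pick a cycle count that cannot overflow the
 * signed 64-bit intermediate product, then convert it to nanoseconds. */
static uint64_t max_nsecs(uint32_t mult, uint32_t shift, uint32_t maxadj,
			  uint64_t mask)
{
	/* conservative bound: 1 << (63 - (ilog2(mult + maxadj) + 1)) */
	uint64_t max_cycles = 1ULL << (63 - (ilog2_u32(mult + maxadj) + 1));

	if (max_cycles > mask)
		max_cycles = mask;
	return (max_cycles * (mult - maxadj)) >> shift;
}

int main(void)
{
	/* roughly a 24 MHz, 56-bit counter with shift 24 */
	printf("max deferment: %llu ns\n",
	       (unsigned long long)max_nsecs(699050667u, 24, 0,
					     (1ULL << 56) - 1));
	return 0;
}
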
*/ -	max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); +	max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));  	/*  	 * The actual maximum number of cycles we can defer the clocksource is -	 * determined by the minimum of max_cycles and cs->mask. +	 * determined by the minimum of max_cycles and mask.  	 * Note: Here we subtract the maxadj to make sure we don't sleep for  	 * too long if there's a large negative adjustment.  	 */ -	max_cycles = min_t(u64, max_cycles, (u64) cs->mask); -	max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, -					cs->shift); +	max_cycles = min(max_cycles, mask); +	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + +	return max_nsecs; +} + +/** + * clocksource_max_deferment - Returns max time the clocksource can be deferred + * @cs:         Pointer to clocksource + * + */ +static u64 clocksource_max_deferment(struct clocksource *cs) +{ +	u64 max_nsecs; +	max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, +					  cs->mask);  	/*  	 * To ensure that the clocksource does not wrap whilst we are idle,  	 * limit the time the clocksource can be deferred by 12.5%. Please @@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev,  	return count;  } -size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) +ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)  {  	size_t ret = cnt; @@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,  					  struct device_attribute *attr,  					  const char *buf, size_t count)  { -	size_t ret; +	ssize_t ret;  	mutex_lock(&clocksource_mutex); @@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,  {  	struct clocksource *cs;  	char name[CS_NAME_LEN]; -	size_t ret; +	ssize_t ret;  	ret = sysfs_get_uname(buf, name, count);  	if (ret < 0) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index bb2215174f05..af8d1d4f3d55 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)  	 * called as close as possible to 500 ms before the new second starts.  	 * This code is run on a timer.  If the clock is set, that timer  	 * may not expire at the correct time.  Thus, we adjust... +	 * We want the clock to be within a couple of ticks from the target.  	 
*/  	if (!ntp_synced()) {  		/* @@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work)  	}  	getnstimeofday(&now); -	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { +	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {  		struct timespec adjust = now;  		fail = -ENODEV; diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0b479a6a22bb..68b799375981 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -8,25 +8,28 @@  #include <linux/clocksource.h>  #include <linux/init.h>  #include <linux/jiffies.h> +#include <linux/ktime.h>  #include <linux/kernel.h>  #include <linux/moduleparam.h>  #include <linux/sched.h>  #include <linux/syscore_ops.h> -#include <linux/timer.h> +#include <linux/hrtimer.h>  #include <linux/sched_clock.h> +#include <linux/seqlock.h> +#include <linux/bitops.h>  struct clock_data { +	ktime_t wrap_kt;  	u64 epoch_ns; -	u32 epoch_cyc; -	u32 epoch_cyc_copy; +	u64 epoch_cyc; +	seqcount_t seq;  	unsigned long rate;  	u32 mult;  	u32 shift;  	bool suspended;  }; -static void sched_clock_poll(unsigned long wrap_ticks); -static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); +static struct hrtimer sched_clock_timer;  static int irqtime = -1;  core_param(irqtime, irqtime, int, 0400); @@ -35,42 +38,46 @@ static struct clock_data cd = {  	.mult	= NSEC_PER_SEC / HZ,  }; -static u32 __read_mostly sched_clock_mask = 0xffffffff; +static u64 __read_mostly sched_clock_mask; -static u32 notrace jiffy_sched_clock_read(void) +static u64 notrace jiffy_sched_clock_read(void)  { -	return (u32)(jiffies - INITIAL_JIFFIES); +	/* +	 * We don't need to use get_jiffies_64 on 32-bit arches here +	 * because we register with BITS_PER_LONG +	 */ +	return (u64)(jiffies - INITIAL_JIFFIES);  } -static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static u32 __read_mostly (*read_sched_clock_32)(void); + +static u64 notrace read_sched_clock_32_wrapper(void) +{ +	return read_sched_clock_32(); +} + +static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;  static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)  {  	return (cyc * mult) >> shift;  } -static unsigned long long notrace sched_clock_32(void) +unsigned long long notrace sched_clock(void)  {  	u64 epoch_ns; -	u32 epoch_cyc; -	u32 cyc; +	u64 epoch_cyc; +	u64 cyc; +	unsigned long seq;  	if (cd.suspended)  		return cd.epoch_ns; -	/* -	 * Load the epoch_cyc and epoch_ns atomically.  We do this by -	 * ensuring that we always write epoch_cyc, epoch_ns and -	 * epoch_cyc_copy in strict order, and read them in strict order. -	 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in -	 * the middle of an update, and we should repeat the load. -	 */  	do { +		seq = read_seqcount_begin(&cd.seq);  		epoch_cyc = cd.epoch_cyc; -		smp_rmb();  		epoch_ns = cd.epoch_ns; -		smp_rmb(); -	} while (epoch_cyc != cd.epoch_cyc_copy); +	} while (read_seqcount_retry(&cd.seq, seq));  	cyc = read_sched_clock();  	cyc = (cyc - epoch_cyc) & sched_clock_mask; @@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void)  static void notrace update_sched_clock(void)  {  	unsigned long flags; -	u32 cyc; +	u64 cyc;  	u64 ns;  	cyc = read_sched_clock();  	ns = cd.epoch_ns +  		cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,  			  cd.mult, cd.shift); -	/* -	 * Write epoch_cyc and epoch_ns in a way that the update is -	 * detectable in cyc_to_fixed_sched_clock(). 
-	 */ +  	raw_local_irq_save(flags); -	cd.epoch_cyc_copy = cyc; -	smp_wmb(); +	write_seqcount_begin(&cd.seq);  	cd.epoch_ns = ns; -	smp_wmb();  	cd.epoch_cyc = cyc; +	write_seqcount_end(&cd.seq);  	raw_local_irq_restore(flags);  } -static void sched_clock_poll(unsigned long wrap_ticks) +static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)  { -	mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));  	update_sched_clock(); +	hrtimer_forward_now(hrt, cd.wrap_kt); +	return HRTIMER_RESTART;  } -void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) +void __init sched_clock_register(u64 (*read)(void), int bits, +				 unsigned long rate)  { -	unsigned long r, w; +	unsigned long r;  	u64 res, wrap;  	char r_unit;  	if (cd.rate > rate)  		return; -	BUG_ON(bits > 32);  	WARN_ON(!irqs_disabled());  	read_sched_clock = read; -	sched_clock_mask = (1ULL << bits) - 1; +	sched_clock_mask = CLOCKSOURCE_MASK(bits);  	cd.rate = rate;  	/* calculate the mult/shift to convert counter ticks to ns. */ -	clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); +	clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);  	r = rate;  	if (r >= 4000000) { @@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)  		r_unit = ' ';  	/* calculate how many ns until we wrap */ -	wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); -	do_div(wrap, NSEC_PER_MSEC); -	w = wrap; +	wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); +	cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));  	/* calculate the ns resolution of this counter */  	res = cyc_to_ns(1ULL, cd.mult, cd.shift); -	pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", -		bits, r, r_unit, res, w); +	pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", +		bits, r, r_unit, res, wrap); -	/* -	 * Start the timer to keep sched_clock() properly updated and -	 * sets the initial epoch. -	 */ -	sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));  	update_sched_clock();  	/* @@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)  	pr_debug("Registered %pF as sched_clock source\n", read);  } -unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; - -unsigned long long notrace sched_clock(void) +void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)  { -	return sched_clock_func(); +	read_sched_clock_32 = read; +	sched_clock_register(read_sched_clock_32_wrapper, bits, rate);  }  void __init sched_clock_postinit(void) @@ -180,14 +177,22 @@ void __init sched_clock_postinit(void)  	 * make it the final one one.  	 */  	if (read_sched_clock == jiffy_sched_clock_read) -		setup_sched_clock(jiffy_sched_clock_read, 32, HZ); +		sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); -	sched_clock_poll(sched_clock_timer.data); +	update_sched_clock(); + +	/* +	 * Start the timer to keep sched_clock() properly updated and +	 * sets the initial epoch. 
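
The epoch_cyc/epoch_cyc_copy trick is replaced by a seqcount: the writer bumps a sequence counter around the update, and readers retry if they saw an odd value or the counter changed while they sampled epoch_cyc and epoch_ns. A userspace model of that read/retry protocol (single writer assumed, C11 atomics with sequentially consistent ordering for simplicity, names invented):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic unsigned int seqcnt;
static _Atomic uint64_t epoch_cyc, epoch_ns;	/* protected by the seqcount */

static void update_epoch(uint64_t cyc, uint64_t ns)	/* writer side */
{
	atomic_fetch_add(&seqcnt, 1);		/* sequence becomes odd */
	atomic_store(&epoch_cyc, cyc);
	atomic_store(&epoch_ns, ns);
	atomic_fetch_add(&seqcnt, 1);		/* even again: update done */
}

static void read_epoch(uint64_t *cyc, uint64_t *ns)	/* any reader */
{
	unsigned int s1, s2;

	do {
		s1 = atomic_load(&seqcnt);
		*cyc = atomic_load(&epoch_cyc);
		*ns  = atomic_load(&epoch_ns);
		s2 = atomic_load(&seqcnt);
	} while (s1 != s2 || (s1 & 1));		/* raced with an update: retry */
}
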
+	 */ +	hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	sched_clock_timer.function = sched_clock_poll; +	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);  }  static int sched_clock_suspend(void)  { -	sched_clock_poll(sched_clock_timer.data); +	sched_clock_poll(&sched_clock_timer);  	cd.suspended = true;  	return 0;  } @@ -195,7 +200,6 @@ static int sched_clock_suspend(void)  static void sched_clock_resume(void)  {  	cd.epoch_cyc = read_sched_clock(); -	cd.epoch_cyc_copy = cd.epoch_cyc;  	cd.suspended = false;  } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 218bcb565fed..9532690daaa9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,  					struct clock_event_device *newdev)  {  	if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || +	    (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||  	    (newdev->features & CLOCK_EVT_FEAT_C3STOP))  		return false; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index bc906cad709b..18e71f7fbc2a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev);  extern void clockevents_shutdown(struct clock_event_device *dev); -extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); +extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);  /*   * NO_HZ / high resolution timer shared code diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 947ba25a95a0..3abf53418b67 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,   * ktime_get_update_offsets - hrtimer helper   * @offs_real:	pointer to storage for monotonic -> realtime offset   * @offs_boot:	pointer to storage for monotonic -> boottime offset + * @offs_tai:	pointer to storage for monotonic -> clock tai offset   *   * Returns current monotonic time and updates the offsets - * Called from hrtimer_interupt() or retrigger_next_event() + * Called from hrtimer_interrupt() or retrigger_next_event()   */  ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,  							ktime_t *offs_tai) diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 0b537f27b559..1fb08f21302e 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)  	period = ktime_to_timespec(time);  	ms = period.tv_nsec / 1000000; -	seq_puts(m, "Timer Stats Version: v0.2\n"); +	seq_puts(m, "Timer Stats Version: v0.3\n");  	seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);  	if (atomic_read(&overflow_count)) -		seq_printf(m, "Overflow: %d entries\n", -			atomic_read(&overflow_count)); +		seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count)); +	seq_printf(m, "Collection: %s\n", timer_stats_active ? 
"active" : "inactive");  	for (i = 0; i < nr_entries; i++) {  		entry = entries + i; - 		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { +		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {  			seq_printf(m, "%4luD, %5d %-16s ",  				entry->count, entry->pid, entry->comm);  		} else { diff --git a/kernel/timer.c b/kernel/timer.c index 4296d13db3d1..6582b82fa966 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)  static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),  			  unsigned long data)  { -	int preempt_count = preempt_count(); +	int count = preempt_count();  #ifdef CONFIG_LOCKDEP  	/* @@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),  	lock_map_release(&lockdep_map); -	if (preempt_count != preempt_count()) { +	if (count != preempt_count()) {  		WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", -			  fn, preempt_count, preempt_count()); +			  fn, count, preempt_count());  		/*  		 * Restore the preempt count. That gives us a decent  		 * chance to survive and extract information. If the  		 * callback kept a lock held, bad luck, but not worse  		 * than the BUG() we had.  		 */ -		preempt_count() = preempt_count; +		preempt_count_set(count);  	}  } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b8b8560bfb95..f785aef65799 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -26,6 +26,7 @@  #include <linux/export.h>  #include <linux/time.h>  #include <linux/uaccess.h> +#include <linux/list.h>  #include <trace/events/block.h> @@ -38,6 +39,9 @@ static unsigned int blktrace_seq __read_mostly = 1;  static struct trace_array *blk_tr;  static bool blk_tracer_enabled __read_mostly; +static LIST_HEAD(running_trace_list); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); +  /* Select an alternative, minimalistic output than the original one */  #define TRACE_BLK_OPT_CLASSIC	0x1 @@ -107,10 +111,18 @@ record_it:   * Send out a notify for this process, if we haven't done so since a trace   * started   */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) +static void trace_note_tsk(struct task_struct *tsk)  { +	unsigned long flags; +	struct blk_trace *bt; +  	tsk->btrace_seq = blktrace_seq; -	trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); +	spin_lock_irqsave(&running_trace_lock, flags); +	list_for_each_entry(bt, &running_trace_list, running_list) { +		trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, +			   sizeof(tsk->comm)); +	} +	spin_unlock_irqrestore(&running_trace_lock, flags);  }  static void trace_note_time(struct blk_trace *bt) @@ -229,16 +241,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,  		goto record_it;  	} +	if (unlikely(tsk->btrace_seq != blktrace_seq)) +		trace_note_tsk(tsk); +  	/*  	 * A word about the locking here - we disable interrupts to reserve  	 * some space in the relay per-cpu buffer, to prevent an irq  	 * from coming in and stepping on our toes.  	 
*/  	local_irq_save(flags); - -	if (unlikely(tsk->btrace_seq != blktrace_seq)) -		trace_note_tsk(bt, tsk); -  	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);  	if (t) {  		sequence = per_cpu_ptr(bt->sequence, cpu); @@ -477,6 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,  	bt->dir = dir;  	bt->dev = dev;  	atomic_set(&bt->dropped, 0); +	INIT_LIST_HEAD(&bt->running_list);  	ret = -EIO;  	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, @@ -567,13 +579,12 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,  		.end_lba = cbuts.end_lba,  		.pid = cbuts.pid,  	}; -	memcpy(&buts.name, &cbuts.name, 32);  	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);  	if (ret)  		return ret; -	if (copy_to_user(arg, &buts.name, 32)) { +	if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {  		blk_trace_remove(q);  		return -EFAULT;  	} @@ -601,6 +612,9 @@ int blk_trace_startstop(struct request_queue *q, int start)  			blktrace_seq++;  			smp_mb();  			bt->trace_state = Blktrace_running; +			spin_lock_irq(&running_trace_lock); +			list_add(&bt->running_list, &running_trace_list); +			spin_unlock_irq(&running_trace_lock);  			trace_note_time(bt);  			ret = 0; @@ -608,6 +622,9 @@ int blk_trace_startstop(struct request_queue *q, int start)  	} else {  		if (bt->trace_state == Blktrace_running) {  			bt->trace_state = Blktrace_stopped; +			spin_lock_irq(&running_trace_lock); +			list_del_init(&bt->running_list); +			spin_unlock_irq(&running_trace_lock);  			relay_flush(bt->rchan);  			ret = 0;  		} @@ -1472,6 +1489,9 @@ static int blk_trace_remove_queue(struct request_queue *q)  	if (atomic_dec_and_test(&blk_probes_ref))  		blk_unregister_tracepoints(); +	spin_lock_irq(&running_trace_lock); +	list_del(&bt->running_list); +	spin_unlock_irq(&running_trace_lock);  	blk_trace_free(bt);  	return 0;  } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 03cf44ac54d3..0e9f9eaade2f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -367,9 +367,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,  static int __register_ftrace_function(struct ftrace_ops *ops)  { -	if (unlikely(ftrace_disabled)) -		return -ENODEV; -  	if (FTRACE_WARN_ON(ops == &global_ops))  		return -EINVAL; @@ -428,9 +425,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)  {  	int ret; -	if (ftrace_disabled) -		return -ENODEV; -  	if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))  		return -EBUSY; @@ -2088,10 +2082,15 @@ static void ftrace_startup_enable(int command)  static int ftrace_startup(struct ftrace_ops *ops, int command)  {  	bool hash_enable = true; +	int ret;  	if (unlikely(ftrace_disabled))  		return -ENODEV; +	ret = __register_ftrace_function(ops); +	if (ret) +		return ret; +  	ftrace_start_up++;  	command |= FTRACE_UPDATE_CALLS; @@ -2113,12 +2112,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)  	return 0;  } -static void ftrace_shutdown(struct ftrace_ops *ops, int command) +static int ftrace_shutdown(struct ftrace_ops *ops, int command)  {  	bool hash_disable = true; +	int ret;  	if (unlikely(ftrace_disabled)) -		return; +		return -ENODEV; + +	ret = __unregister_ftrace_function(ops); +	if (ret) +		return ret;  	ftrace_start_up--;  	/* @@ -2153,9 +2157,10 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)  	}  	if (!command || !ftrace_enabled) -		return; +		return 0;  	ftrace_run_update_code(command); +	return 0;  }  static void ftrace_startup_sysctl(void) @@ 
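
trace_note_tsk() no longer takes a specific blk_trace; every running trace is kept on a global list protected by a spinlock, and the task-name note is emitted to each of them. In userspace terms this is the familiar "registered observers under a lock" pattern; a minimal sketch, with a pthread mutex standing in for the irq-safe spinlock and invented names:

#include <pthread.h>

/* Observers registered on a global list; notifications walk the list while
 * holding the lock, mirroring running_trace_list/running_trace_lock. */
struct observer {
	struct observer *next;
	void (*notify)(const char *msg);
};

static pthread_mutex_t observers_lock = PTHREAD_MUTEX_INITIALIZER;
static struct observer *observers;

static void register_observer(struct observer *obs)
{
	pthread_mutex_lock(&observers_lock);
	obs->next = observers;
	observers = obs;
	pthread_mutex_unlock(&observers_lock);
}

static void notify_all(const char *msg)
{
	pthread_mutex_lock(&observers_lock);
	for (struct observer *o = observers; o; o = o->next)
		o->notify(msg);
	pthread_mutex_unlock(&observers_lock);
}
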
-3060,16 +3065,13 @@ static void __enable_ftrace_function_probe(void)  	if (i == FTRACE_FUNC_HASHSIZE)  		return; -	ret = __register_ftrace_function(&trace_probe_ops); -	if (!ret) -		ret = ftrace_startup(&trace_probe_ops, 0); +	ret = ftrace_startup(&trace_probe_ops, 0);  	ftrace_probe_registered = 1;  }  static void __disable_ftrace_function_probe(void)  { -	int ret;  	int i;  	if (!ftrace_probe_registered) @@ -3082,9 +3084,7 @@ static void __disable_ftrace_function_probe(void)  	}  	/* no more funcs left */ -	ret = __unregister_ftrace_function(&trace_probe_ops); -	if (!ret) -		ftrace_shutdown(&trace_probe_ops, 0); +	ftrace_shutdown(&trace_probe_ops, 0);  	ftrace_probe_registered = 0;  } @@ -3307,7 +3307,11 @@ void unregister_ftrace_function_probe_all(char *glob)  static LIST_HEAD(ftrace_commands);  static DEFINE_MUTEX(ftrace_cmd_mutex); -int register_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only register ftrace commands from __init, so mark this + * __init too. + */ +__init int register_ftrace_command(struct ftrace_func_command *cmd)  {  	struct ftrace_func_command *p;  	int ret = 0; @@ -3326,7 +3330,11 @@ int register_ftrace_command(struct ftrace_func_command *cmd)  	return ret;  } -int unregister_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only unregister ftrace commands from __init, so mark + * this __init too. + */ +__init int unregister_ftrace_command(struct ftrace_func_command *cmd)  {  	struct ftrace_func_command *p, *n;  	int ret = -ENODEV; @@ -3641,7 +3649,7 @@ __setup("ftrace_filter=", set_ftrace_filter);  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);  static int __init set_graph_function(char *str)  { @@ -3659,7 +3667,7 @@ static void __init set_ftrace_early_graph(char *buf)  		func = strsep(&buf, ",");  		/* we allow only one expression at a time */  		ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, -				      func); +				      FTRACE_GRAPH_MAX_FUNCS, func);  		if (ret)  			printk(KERN_DEBUG "ftrace: function %s not "  					  "traceable\n", func); @@ -3776,15 +3784,25 @@ static const struct file_operations ftrace_notrace_fops = {  static DEFINE_MUTEX(graph_lock);  int ftrace_graph_count; -int ftrace_graph_filter_enabled; +int ftrace_graph_notrace_count;  unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +struct ftrace_graph_data { +	unsigned long *table; +	size_t size; +	int *count; +	const struct seq_operations *seq_ops; +};  static void *  __g_next(struct seq_file *m, loff_t *pos)  { -	if (*pos >= ftrace_graph_count) +	struct ftrace_graph_data *fgd = m->private; + +	if (*pos >= *fgd->count)  		return NULL; -	return &ftrace_graph_funcs[*pos]; +	return &fgd->table[*pos];  }  static void * @@ -3796,10 +3814,12 @@ g_next(struct seq_file *m, void *v, loff_t *pos)  static void *g_start(struct seq_file *m, loff_t *pos)  { +	struct ftrace_graph_data *fgd = m->private; +  	mutex_lock(&graph_lock);  	/* Nothing, tell g_show to print all functions are enabled */ -	if (!ftrace_graph_filter_enabled && !*pos) +	if (!*fgd->count && !*pos)  		return (void *)1;  	return __g_next(m, pos); @@ -3835,38 +3855,88 @@ static const struct seq_operations ftrace_graph_seq_ops = {  };  static int -ftrace_graph_open(struct 
inode *inode, struct file *file) +__ftrace_graph_open(struct inode *inode, struct file *file, +		    struct ftrace_graph_data *fgd)  {  	int ret = 0; -	if (unlikely(ftrace_disabled)) -		return -ENODEV; -  	mutex_lock(&graph_lock);  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC)) { -		ftrace_graph_filter_enabled = 0; -		ftrace_graph_count = 0; -		memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); +		*fgd->count = 0; +		memset(fgd->table, 0, fgd->size * sizeof(*fgd->table));  	}  	mutex_unlock(&graph_lock); -	if (file->f_mode & FMODE_READ) -		ret = seq_open(file, &ftrace_graph_seq_ops); +	if (file->f_mode & FMODE_READ) { +		ret = seq_open(file, fgd->seq_ops); +		if (!ret) { +			struct seq_file *m = file->private_data; +			m->private = fgd; +		} +	} else +		file->private_data = fgd;  	return ret;  }  static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ +	struct ftrace_graph_data *fgd; + +	if (unlikely(ftrace_disabled)) +		return -ENODEV; + +	fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); +	if (fgd == NULL) +		return -ENOMEM; + +	fgd->table = ftrace_graph_funcs; +	fgd->size = FTRACE_GRAPH_MAX_FUNCS; +	fgd->count = &ftrace_graph_count; +	fgd->seq_ops = &ftrace_graph_seq_ops; + +	return __ftrace_graph_open(inode, file, fgd); +} + +static int +ftrace_graph_notrace_open(struct inode *inode, struct file *file) +{ +	struct ftrace_graph_data *fgd; + +	if (unlikely(ftrace_disabled)) +		return -ENODEV; + +	fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); +	if (fgd == NULL) +		return -ENOMEM; + +	fgd->table = ftrace_graph_notrace_funcs; +	fgd->size = FTRACE_GRAPH_MAX_FUNCS; +	fgd->count = &ftrace_graph_notrace_count; +	fgd->seq_ops = &ftrace_graph_seq_ops; + +	return __ftrace_graph_open(inode, file, fgd); +} + +static int  ftrace_graph_release(struct inode *inode, struct file *file)  { -	if (file->f_mode & FMODE_READ) +	if (file->f_mode & FMODE_READ) { +		struct seq_file *m = file->private_data; + +		kfree(m->private);  		seq_release(inode, file); +	} else { +		kfree(file->private_data); +	} +  	return 0;  }  static int -ftrace_set_func(unsigned long *array, int *idx, char *buffer) +ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)  {  	struct dyn_ftrace *rec;  	struct ftrace_page *pg; @@ -3879,7 +3949,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)  	/* decode regex */  	type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); -	if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) +	if (!not && *idx >= size)  		return -EBUSY;  	search_len = strlen(search); @@ -3907,7 +3977,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)  				fail = 0;  				if (!exists) {  					array[(*idx)++] = rec->ip; -					if (*idx >= FTRACE_GRAPH_MAX_FUNCS) +					if (*idx >= size)  						goto out;  				}  			} else { @@ -3925,8 +3995,6 @@ out:  	if (fail)  		return -EINVAL; -	ftrace_graph_filter_enabled = !!(*idx); -  	return 0;  } @@ -3935,36 +4003,33 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,  		   size_t cnt, loff_t *ppos)  {  	struct trace_parser parser; -	ssize_t read, ret; +	ssize_t read, ret = 0; +	struct ftrace_graph_data *fgd = file->private_data;  	if (!cnt)  		return 0; -	mutex_lock(&graph_lock); - -	if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { -		ret = -ENOMEM; -		goto out_unlock; -	} +	if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) +		return -ENOMEM;  	read = trace_get_user(&parser, ubuf, cnt, ppos);  	if (read >= 0 && trace_parser_loaded((&parser))) {  		parser.buffer[parser.idx] = 0; +		
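/*
 * Illustrative sketch, not part of the patch above: the set_graph_function /
 * set_graph_notrace rework lets both debugfs files share one open/write path
 * by handing it a small descriptor (struct ftrace_graph_data) carrying the
 * table, its size and the live count.  The plain user-space C below mirrors
 * only that shape; every name in it is invented for the example.
 */
#include <stdio.h>
#include <stddef.h>

struct graph_table_desc {                /* stands in for struct ftrace_graph_data */
	unsigned long *table;
	size_t size;
	int *count;
};

static unsigned long filter_funcs[4], notrace_funcs[4];
static int filter_count, notrace_count;

/* one writer serves both "files"; it only ever sees the descriptor */
static int add_func(struct graph_table_desc *d, unsigned long ip)
{
	if (*d->count >= (int)d->size)
		return -1;                       /* table full, like -EBUSY */
	d->table[(*d->count)++] = ip;
	return 0;
}

int main(void)
{
	struct graph_table_desc filter  = { filter_funcs,  4, &filter_count };
	struct graph_table_desc notrace = { notrace_funcs, 4, &notrace_count };

	add_func(&filter, 0xc0ffee);
	add_func(&notrace, 0xbadcafe);
	printf("filter=%d notrace=%d\n", filter_count, notrace_count);
	return 0;
}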
mutex_lock(&graph_lock); +  		/* we allow only one expression at a time */ -		ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, -					parser.buffer); -		if (ret) -			goto out_free; +		ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, +				      parser.buffer); + +		mutex_unlock(&graph_lock);  	} -	ret = read; +	if (!ret) +		ret = read; -out_free:  	trace_parser_put(&parser); -out_unlock: -	mutex_unlock(&graph_lock);  	return ret;  } @@ -3976,6 +4041,14 @@ static const struct file_operations ftrace_graph_fops = {  	.llseek		= ftrace_filter_lseek,  	.release	= ftrace_graph_release,  }; + +static const struct file_operations ftrace_graph_notrace_fops = { +	.open		= ftrace_graph_notrace_open, +	.read		= seq_read, +	.write		= ftrace_graph_write, +	.llseek		= ftrace_filter_lseek, +	.release	= ftrace_graph_release, +};  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */  static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) @@ -3997,6 +4070,9 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)  	trace_create_file("set_graph_function", 0444, d_tracer,  				    NULL,  				    &ftrace_graph_fops); +	trace_create_file("set_graph_notrace", 0444, d_tracer, +				    NULL, +				    &ftrace_graph_notrace_fops);  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */  	return 0; @@ -4290,12 +4366,15 @@ core_initcall(ftrace_nodyn_init);  static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }  static inline void ftrace_startup_enable(int command) { }  /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(ops, command)			\ -	({						\ -		(ops)->flags |= FTRACE_OPS_FL_ENABLED;	\ -		0;					\ +# define ftrace_startup(ops, command)					\ +	({								\ +		int ___ret = __register_ftrace_function(ops);		\ +		if (!___ret)						\ +			(ops)->flags |= FTRACE_OPS_FL_ENABLED;		\ +		___ret;							\  	}) -# define ftrace_shutdown(ops, command)	do { } while (0) +# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) +  # define ftrace_startup_sysctl()	do { } while (0)  # define ftrace_shutdown_sysctl()	do { } while (0) @@ -4320,12 +4399,21 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,  	 */  	preempt_disable_notrace();  	trace_recursion_set(TRACE_CONTROL_BIT); + +	/* +	 * Control funcs (perf) uses RCU. Only trace if +	 * RCU is currently active. 
+	 */ +	if (!rcu_is_watching()) +		goto out; +  	do_for_each_ftrace_op(op, ftrace_control_list) {  		if (!(op->flags & FTRACE_OPS_FL_STUB) &&  		    !ftrace_function_local_disabled(op) &&  		    ftrace_ops_test(op, ip, regs))  			op->func(ip, parent_ip, op, regs);  	} while_for_each_ftrace_op(op); + out:  	trace_recursion_clear(TRACE_CONTROL_BIT);  	preempt_enable_notrace();  } @@ -4695,9 +4783,7 @@ int register_ftrace_function(struct ftrace_ops *ops)  	mutex_lock(&ftrace_lock); -	ret = __register_ftrace_function(ops); -	if (!ret) -		ret = ftrace_startup(ops, 0); +	ret = ftrace_startup(ops, 0);  	mutex_unlock(&ftrace_lock); @@ -4716,9 +4802,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)  	int ret;  	mutex_lock(&ftrace_lock); -	ret = __unregister_ftrace_function(ops); -	if (!ret) -		ftrace_shutdown(ops, 0); +	ret = ftrace_shutdown(ops, 0);  	mutex_unlock(&ftrace_lock);  	return ret; @@ -4912,6 +4996,13 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,  	return NOTIFY_DONE;  } +/* Just a place holder for function graph */ +static struct ftrace_ops fgraph_ops __read_mostly = { +	.func		= ftrace_stub, +	.flags		= FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | +				FTRACE_OPS_FL_RECURSION_SAFE, +}; +  int register_ftrace_graph(trace_func_graph_ret_t retfunc,  			trace_func_graph_ent_t entryfunc)  { @@ -4938,7 +5029,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,  	ftrace_graph_return = retfunc;  	ftrace_graph_entry = entryfunc; -	ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); +	ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);  out:  	mutex_unlock(&ftrace_lock); @@ -4955,7 +5046,7 @@ void unregister_ftrace_graph(void)  	ftrace_graph_active--;  	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;  	ftrace_graph_entry = ftrace_graph_entry_stub; -	ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); +	ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);  	unregister_pm_notifier(&ftrace_suspend_notifier);  	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7974ba20557d..9d20cd9743ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -235,13 +235,33 @@ void trace_array_put(struct trace_array *this_tr)  	mutex_unlock(&trace_types_lock);  } -int filter_current_check_discard(struct ring_buffer *buffer, -				 struct ftrace_event_call *call, void *rec, -				 struct ring_buffer_event *event) +int filter_check_discard(struct ftrace_event_file *file, void *rec, +			 struct ring_buffer *buffer, +			 struct ring_buffer_event *event)  { -	return filter_check_discard(call, rec, buffer, event); +	if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) && +	    !filter_match_preds(file->filter, rec)) { +		ring_buffer_discard_commit(buffer, event); +		return 1; +	} + +	return 0; +} +EXPORT_SYMBOL_GPL(filter_check_discard); + +int call_filter_check_discard(struct ftrace_event_call *call, void *rec, +			      struct ring_buffer *buffer, +			      struct ring_buffer_event *event) +{ +	if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && +	    !filter_match_preds(call->filter, rec)) { +		ring_buffer_discard_commit(buffer, event); +		return 1; +	} + +	return 0;  } -EXPORT_SYMBOL_GPL(filter_current_check_discard); +EXPORT_SYMBOL_GPL(call_filter_check_discard);  cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)  { @@ -843,9 +863,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,  	if (isspace(ch)) {  		
parser->buffer[parser->idx] = 0;  		parser->cont = false; -	} else { +	} else if (parser->idx < parser->size - 1) {  		parser->cont = true;  		parser->buffer[parser->idx++] = ch; +	} else { +		ret = -EINVAL; +		goto out;  	}  	*ppos += read; @@ -1261,21 +1284,6 @@ int is_tracing_stopped(void)  }  /** - * ftrace_off_permanent - disable all ftrace code permanently - * - * This should only be called when a serious anomally has - * been detected.  This will turn off the function tracing, - * ring buffers, and other tracing utilites. It takes no - * locks and can be called from any context. - */ -void ftrace_off_permanent(void) -{ -	tracing_disabled = 1; -	ftrace_stop(); -	tracing_off_permanent(); -} - -/**   * tracing_start - quick start of the tracer   *   * If tracing is enabled but was stopped by tracing_stop, @@ -1509,7 +1517,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,  #endif  		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |  		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | -		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); +		(tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | +		(test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);  }  EXPORT_SYMBOL_GPL(tracing_generic_entry_update); @@ -1630,7 +1639,7 @@ trace_function(struct trace_array *tr,  	entry->ip			= ip;  	entry->parent_ip		= parent_ip; -	if (!filter_check_discard(call, entry, buffer, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		__buffer_unlock_commit(buffer, event);  } @@ -1714,7 +1723,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,  	entry->size = trace.nr_entries; -	if (!filter_check_discard(call, entry, buffer, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		__buffer_unlock_commit(buffer, event);   out: @@ -1816,7 +1825,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)  	trace.entries		= entry->caller;  	save_stack_trace_user(&trace); -	if (!filter_check_discard(call, entry, buffer, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		__buffer_unlock_commit(buffer, event);   out_drop_count: @@ -2008,7 +2017,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)  	entry->fmt			= fmt;  	memcpy(entry->buf, tbuffer, sizeof(u32) * len); -	if (!filter_check_discard(call, entry, buffer, event)) { +	if (!call_filter_check_discard(call, entry, buffer, event)) {  		__buffer_unlock_commit(buffer, event);  		ftrace_trace_stack(buffer, flags, 6, pc);  	} @@ -2063,7 +2072,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,  	memcpy(&entry->buf, tbuffer, len);  	entry->buf[len] = '\0'; -	if (!filter_check_discard(call, entry, buffer, event)) { +	if (!call_filter_check_discard(call, entry, buffer, event)) {  		__buffer_unlock_commit(buffer, event);  		ftrace_trace_stack(buffer, flags, 6, pc);  	} @@ -2760,7 +2769,7 @@ static void show_snapshot_main_help(struct seq_file *m)  	seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");  	seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");  	seq_printf(m, "#                      Takes a snapshot of the main buffer.\n"); -	seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); +	seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n");  	seq_printf(m, "#                      (Doesn't have to be '2' works with any number that\n");  	seq_printf(m, "#                      
 is not a '0' or '1')\n");  } @@ -2964,6 +2973,11 @@ int tracing_open_generic(struct inode *inode, struct file *filp)  	return 0;  } +bool tracing_is_disabled(void) +{ +	return (tracing_disabled) ? true: false; +} +  /*   * Open and update trace_array ref count.   * Must have the current trace_array passed to it. @@ -5454,12 +5468,12 @@ static struct ftrace_func_command ftrace_snapshot_cmd = {  	.func			= ftrace_trace_snapshot_callback,  }; -static int register_snapshot_cmd(void) +static __init int register_snapshot_cmd(void)  {  	return register_ftrace_command(&ftrace_snapshot_cmd);  }  #else -static inline int register_snapshot_cmd(void) { return 0; } +static inline __init int register_snapshot_cmd(void) { return 0; }  #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */  struct dentry *tracing_init_dentry_tr(struct trace_array *tr) @@ -6253,6 +6267,17 @@ void trace_init_global_iter(struct trace_iterator *iter)  	iter->trace = iter->tr->current_trace;  	iter->cpu_file = RING_BUFFER_ALL_CPUS;  	iter->trace_buffer = &global_trace.trace_buffer; + +	if (iter->trace && iter->trace->open) +		iter->trace->open(iter); + +	/* Annotate start of buffers if we had overruns */ +	if (ring_buffer_overruns(iter->trace_buffer->buffer)) +		iter->iter_flags |= TRACE_FILE_ANNOTATE; + +	/* Output in nanoseconds only if we are using a clock in nanoseconds. */ +	if (trace_clocks[iter->tr->clock_id].in_ns) +		iter->iter_flags |= TRACE_FILE_TIME_IN_NS;  }  void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 10c86fb7a2b4..ea189e027b80 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -124,6 +124,7 @@ enum trace_flag_type {  	TRACE_FLAG_NEED_RESCHED		= 0x04,  	TRACE_FLAG_HARDIRQ		= 0x08,  	TRACE_FLAG_SOFTIRQ		= 0x10, +	TRACE_FLAG_PREEMPT_RESCHED	= 0x20,  };  #define TRACE_BUF_SIZE		1024 @@ -192,8 +193,8 @@ struct trace_array {  #ifdef CONFIG_FTRACE_SYSCALLS  	int			sys_refcount_enter;  	int			sys_refcount_exit; -	DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); -	DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +	struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls]; +	struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls];  #endif  	int			stop_count;  	int			clock_id; @@ -514,6 +515,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf);  void tracing_reset_current(int cpu);  void tracing_reset_all_online_cpus(void);  int tracing_open_generic(struct inode *inode, struct file *filp); +bool tracing_is_disabled(void);  struct dentry *trace_create_file(const char *name,  				 umode_t mode,  				 struct dentry *parent, @@ -711,6 +713,8 @@ extern unsigned long trace_flags;  #define TRACE_GRAPH_PRINT_PROC          0x8  #define TRACE_GRAPH_PRINT_DURATION      0x10  #define TRACE_GRAPH_PRINT_ABS_TIME      0x20 +#define TRACE_GRAPH_PRINT_FILL_SHIFT	28 +#define TRACE_GRAPH_PRINT_FILL_MASK	(0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)  extern enum print_line_t  print_graph_function_flags(struct trace_iterator *iter, u32 flags); @@ -730,15 +734,16 @@ extern void __trace_graph_return(struct trace_array *tr,  #ifdef CONFIG_DYNAMIC_FTRACE  /* TODO: make this variable */  #define FTRACE_GRAPH_MAX_FUNCS		32 -extern int ftrace_graph_filter_enabled;  extern int ftrace_graph_count;  extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern int ftrace_graph_notrace_count; +extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS];  static inline int ftrace_graph_addr(unsigned 
long addr)  {  	int i; -	if (!ftrace_graph_filter_enabled) +	if (!ftrace_graph_count)  		return 1;  	for (i = 0; i < ftrace_graph_count; i++) { @@ -758,11 +763,31 @@ static inline int ftrace_graph_addr(unsigned long addr)  	return 0;  } + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ +	int i; + +	if (!ftrace_graph_notrace_count) +		return 0; + +	for (i = 0; i < ftrace_graph_notrace_count; i++) { +		if (addr == ftrace_graph_notrace_funcs[i]) +			return 1; +	} + +	return 0; +}  #else  static inline int ftrace_graph_addr(unsigned long addr)  {  	return 1;  } + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ +	return 0; +}  #endif /* CONFIG_DYNAMIC_FTRACE */  #else /* CONFIG_FUNCTION_GRAPH_TRACER */  static inline enum print_line_t @@ -986,9 +1011,9 @@ struct filter_pred {  extern enum regex_type  filter_parse_regex(char *buff, int len, char **search, int *not); -extern void print_event_filter(struct ftrace_event_call *call, +extern void print_event_filter(struct ftrace_event_file *file,  			       struct trace_seq *s); -extern int apply_event_filter(struct ftrace_event_call *call, +extern int apply_event_filter(struct ftrace_event_file *file,  			      char *filter_string);  extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  					char *filter_string); @@ -999,20 +1024,6 @@ extern int filter_assign_type(const char *type);  struct ftrace_event_field *  trace_find_event_field(struct ftrace_event_call *call, char *name); -static inline int -filter_check_discard(struct ftrace_event_call *call, void *rec, -		     struct ring_buffer *buffer, -		     struct ring_buffer_event *event) -{ -	if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && -	    !filter_match_preds(call->filter, rec)) { -		ring_buffer_discard_commit(buffer, event); -		return 1; -	} - -	return 0; -} -  extern void trace_event_enable_cmd_record(bool enable);  extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);  extern int event_trace_del_tracer(struct trace_array *tr); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index d594da0dc03c..697fb9bac8f0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -78,7 +78,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)  	entry->line = f->line;  	entry->correct = val == expect; -	if (!filter_check_discard(call, entry, buffer, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		__buffer_unlock_commit(buffer, event);   out: diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 80c36bcf66e8..78e27e3b52ac 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -26,7 +26,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,  {  	/* The ftrace function trace is allowed only for root. 
*/  	if (ftrace_event_is_function(tp_event) && -	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) +	    perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))  		return -EPERM;  	/* No tracing, just counting, so no obvious leak */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 368a4d50cc30..f919a2e21bf3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -989,7 +989,7 @@ static ssize_t  event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,  		  loff_t *ppos)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file;  	struct trace_seq *s;  	int r = -ENODEV; @@ -1004,12 +1004,12 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,  	trace_seq_init(s);  	mutex_lock(&event_mutex); -	call = event_file_data(filp); -	if (call) -		print_event_filter(call, s); +	file = event_file_data(filp); +	if (file) +		print_event_filter(file, s);  	mutex_unlock(&event_mutex); -	if (call) +	if (file)  		r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);  	kfree(s); @@ -1021,7 +1021,7 @@ static ssize_t  event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  		   loff_t *ppos)  { -	struct ftrace_event_call *call; +	struct ftrace_event_file *file;  	char *buf;  	int err = -ENODEV; @@ -1039,9 +1039,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  	buf[cnt] = '\0';  	mutex_lock(&event_mutex); -	call = event_file_data(filp); -	if (call) -		err = apply_event_filter(call, buf); +	file = event_file_data(filp); +	if (file) +		err = apply_event_filter(file, buf);  	mutex_unlock(&event_mutex);  	free_page((unsigned long) buf); @@ -1062,6 +1062,9 @@ static int subsystem_open(struct inode *inode, struct file *filp)  	struct trace_array *tr;  	int ret; +	if (tracing_is_disabled()) +		return -ENODEV; +  	/* Make sure the system still exists */  	mutex_lock(&trace_types_lock);  	mutex_lock(&event_mutex); @@ -1108,6 +1111,9 @@ static int system_tr_open(struct inode *inode, struct file *filp)  	struct trace_array *tr = inode->i_private;  	int ret; +	if (tracing_is_disabled()) +		return -ENODEV; +  	if (trace_array_get(tr) < 0)  		return -ENODEV; @@ -1124,11 +1130,12 @@ static int system_tr_open(struct inode *inode, struct file *filp)  	if (ret < 0) {  		trace_array_put(tr);  		kfree(dir); +		return ret;  	}  	filp->private_data = dir; -	return ret; +	return 0;  }  static int subsystem_release(struct inode *inode, struct file *file) @@ -1539,7 +1546,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)  			return -1;  		}  	} -	trace_create_file("filter", 0644, file->dir, call, +	trace_create_file("filter", 0644, file->dir, file,  			  &ftrace_event_filter_fops);  	trace_create_file("format", 0444, file->dir, call, @@ -1577,6 +1584,7 @@ static void event_remove(struct ftrace_event_call *call)  		if (file->event_call != call)  			continue;  		ftrace_event_enable_disable(file, 0); +		destroy_preds(file);  		/*  		 * The do_for_each_event_file() is  		 * a double loop. 
After finding the call for this @@ -1700,7 +1708,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)  {  	event_remove(call);  	trace_destroy_fields(call); -	destroy_preds(call); +	destroy_call_preds(call);  }  static int probe_remove_event_call(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 97daa8cf958d..2468f56dc5db 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -637,10 +637,18 @@ static void append_filter_err(struct filter_parse_state *ps,  	free_page((unsigned long) buf);  } +static inline struct event_filter *event_filter(struct ftrace_event_file *file) +{ +	if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		return file->event_call->filter; +	else +		return file->filter; +} +  /* caller must hold event_mutex */ -void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) +void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s)  { -	struct event_filter *filter = call->filter; +	struct event_filter *filter = event_filter(file);  	if (filter && filter->filter_string)  		trace_seq_printf(s, "%s\n", filter->filter_string); @@ -766,11 +774,21 @@ static void __free_preds(struct event_filter *filter)  	filter->n_preds = 0;  } -static void filter_disable(struct ftrace_event_call *call) +static void call_filter_disable(struct ftrace_event_call *call)  {  	call->flags &= ~TRACE_EVENT_FL_FILTERED;  } +static void filter_disable(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		call_filter_disable(call); +	else +		file->flags &= ~FTRACE_EVENT_FL_FILTERED; +} +  static void __free_filter(struct event_filter *filter)  {  	if (!filter) @@ -781,16 +799,30 @@ static void __free_filter(struct event_filter *filter)  	kfree(filter);  } +void destroy_call_preds(struct ftrace_event_call *call) +{ +	__free_filter(call->filter); +	call->filter = NULL; +} + +static void destroy_file_preds(struct ftrace_event_file *file) +{ +	__free_filter(file->filter); +	file->filter = NULL; +} +  /* - * Called when destroying the ftrace_event_call. - * The call is being freed, so we do not need to worry about - * the call being currently used. This is for module code removing + * Called when destroying the ftrace_event_file. + * The file is being freed, so we do not need to worry about + * the file being currently used. This is for module code removing   * the tracepoints from within it.   
*/ -void destroy_preds(struct ftrace_event_call *call) +void destroy_preds(struct ftrace_event_file *file)  { -	__free_filter(call->filter); -	call->filter = NULL; +	if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		destroy_call_preds(file->event_call); +	else +		destroy_file_preds(file);  }  static struct event_filter *__alloc_filter(void) @@ -825,28 +857,56 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)  	return 0;  } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static inline void __remove_filter(struct ftrace_event_file *file)  { +	struct ftrace_event_call *call = file->event_call; + +	filter_disable(file); +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		remove_filter_string(call->filter); +	else +		remove_filter_string(file->filter); +} + +static void filter_free_subsystem_preds(struct event_subsystem *system, +					struct trace_array *tr) +{ +	struct ftrace_event_file *file;  	struct ftrace_event_call *call; -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { +		call = file->event_call;  		if (strcmp(call->class->system, system->name) != 0)  			continue; -		filter_disable(call); -		remove_filter_string(call->filter); +		__remove_filter(file);  	}  } -static void filter_free_subsystem_filters(struct event_subsystem *system) +static inline void __free_subsystem_filter(struct ftrace_event_file *file)  { +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { +		__free_filter(call->filter); +		call->filter = NULL; +	} else { +		__free_filter(file->filter); +		file->filter = NULL; +	} +} + +static void filter_free_subsystem_filters(struct event_subsystem *system, +					  struct trace_array *tr) +{ +	struct ftrace_event_file *file;  	struct ftrace_event_call *call; -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) { +		call = file->event_call;  		if (strcmp(call->class->system, system->name) != 0)  			continue; -		__free_filter(call->filter); -		call->filter = NULL; +		__free_subsystem_filter(file);  	}  } @@ -1617,15 +1677,85 @@ fail:  	return err;  } +static inline void event_set_filtered_flag(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		call->flags |= TRACE_EVENT_FL_FILTERED; +	else +		file->flags |= FTRACE_EVENT_FL_FILTERED; +} + +static inline void event_set_filter(struct ftrace_event_file *file, +				    struct event_filter *filter) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		rcu_assign_pointer(call->filter, filter); +	else +		rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		RCU_INIT_POINTER(call->filter, NULL); +	else +		RCU_INIT_POINTER(file->filter, NULL); +} + +static inline void +event_set_no_set_filter_flag(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) +		call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; +	else +		file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline void +event_clear_no_set_filter_flag(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (call->flags & 
TRACE_EVENT_FL_USE_CALL_FILTER) +		call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; +	else +		file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline bool +event_no_set_filter_flag(struct ftrace_event_file *file) +{ +	struct ftrace_event_call *call = file->event_call; + +	if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER) +		return true; + +	if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && +	    (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) +		return true; + +	return false; +} +  struct filter_list {  	struct list_head	list;  	struct event_filter	*filter;  };  static int replace_system_preds(struct event_subsystem *system, +				struct trace_array *tr,  				struct filter_parse_state *ps,  				char *filter_string)  { +	struct ftrace_event_file *file;  	struct ftrace_event_call *call;  	struct filter_list *filter_item;  	struct filter_list *tmp; @@ -1633,8 +1763,8 @@ static int replace_system_preds(struct event_subsystem *system,  	bool fail = true;  	int err; -	list_for_each_entry(call, &ftrace_events, list) { - +	list_for_each_entry(file, &tr->events, list) { +		call = file->event_call;  		if (strcmp(call->class->system, system->name) != 0)  			continue; @@ -1644,18 +1774,20 @@ static int replace_system_preds(struct event_subsystem *system,  		 */  		err = replace_preds(call, NULL, ps, filter_string, true);  		if (err) -			call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; +			event_set_no_set_filter_flag(file);  		else -			call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; +			event_clear_no_set_filter_flag(file);  	} -	list_for_each_entry(call, &ftrace_events, list) { +	list_for_each_entry(file, &tr->events, list) {  		struct event_filter *filter; +		call = file->event_call; +  		if (strcmp(call->class->system, system->name) != 0)  			continue; -		if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) +		if (event_no_set_filter_flag(file))  			continue;  		filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); @@ -1676,17 +1808,17 @@ static int replace_system_preds(struct event_subsystem *system,  		err = replace_preds(call, filter, ps, filter_string, false);  		if (err) { -			filter_disable(call); +			filter_disable(file);  			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);  			append_filter_err(ps, filter);  		} else -			call->flags |= TRACE_EVENT_FL_FILTERED; +			event_set_filtered_flag(file);  		/*  		 * Regardless of if this returned an error, we still  		 * replace the filter for the call.  		 */ -		filter = call->filter; -		rcu_assign_pointer(call->filter, filter_item->filter); +		filter = event_filter(file); +		event_set_filter(file, filter_item->filter);  		filter_item->filter = filter;  		fail = false; @@ -1816,6 +1948,7 @@ static int create_filter(struct ftrace_event_call *call,   * and always remembers @filter_str.   
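/*
 * Illustrative sketch, not part of the patch: with per-instance filters, a
 * flag on the event call decides whether the shared call->filter or the
 * per-file filter applies.  Self-contained user-space C mirroring that
 * dispatch; the structs and the flag value below are invented for the
 * example.
 */
#include <stdio.h>

#define USE_CALL_FILTER 0x1

struct filter   { const char *expr; };
struct evt_call { unsigned int flags; struct filter *filter; };
struct evt_file { struct evt_call *call; struct filter *filter; };

static struct filter *pick_filter(struct evt_file *f)
{
	if (f->call->flags & USE_CALL_FILTER)
		return f->call->filter;   /* one filter shared by every instance */
	return f->filter;                 /* otherwise each trace instance owns one */
}

int main(void)
{
	struct filter shared = { "pid == 1" }, mine = { "comm == \"bash\"" };
	struct evt_call call = { 0, &shared };
	struct evt_file file = { &call, &mine };

	printf("%s\n", pick_filter(&file)->expr);   /* per-file: comm == "bash" */
	call.flags |= USE_CALL_FILTER;
	printf("%s\n", pick_filter(&file)->expr);   /* shared:   pid == 1 */
	return 0;
}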
*/  static int create_system_filter(struct event_subsystem *system, +				struct trace_array *tr,  				char *filter_str, struct event_filter **filterp)  {  	struct event_filter *filter = NULL; @@ -1824,7 +1957,7 @@ static int create_system_filter(struct event_subsystem *system,  	err = create_filter_start(filter_str, true, &ps, &filter);  	if (!err) { -		err = replace_system_preds(system, ps, filter_str); +		err = replace_system_preds(system, tr, ps, filter_str);  		if (!err) {  			/* System filters just show a default message */  			kfree(filter->filter_string); @@ -1840,20 +1973,25 @@ static int create_system_filter(struct event_subsystem *system,  }  /* caller must hold event_mutex */ -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +int apply_event_filter(struct ftrace_event_file *file, char *filter_string)  { +	struct ftrace_event_call *call = file->event_call;  	struct event_filter *filter;  	int err;  	if (!strcmp(strstrip(filter_string), "0")) { -		filter_disable(call); -		filter = call->filter; +		filter_disable(file); +		filter = event_filter(file); +  		if (!filter)  			return 0; -		RCU_INIT_POINTER(call->filter, NULL); + +		event_clear_filter(file); +  		/* Make sure the filter is not being used */  		synchronize_sched();  		__free_filter(filter); +  		return 0;  	} @@ -1866,14 +2004,15 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)  	 * string  	 */  	if (filter) { -		struct event_filter *tmp = call->filter; +		struct event_filter *tmp; +		tmp = event_filter(file);  		if (!err) -			call->flags |= TRACE_EVENT_FL_FILTERED; +			event_set_filtered_flag(file);  		else -			filter_disable(call); +			filter_disable(file); -		rcu_assign_pointer(call->filter, filter); +		event_set_filter(file, filter);  		if (tmp) {  			/* Make sure the call is done with the filter */ @@ -1889,6 +2028,7 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  				 char *filter_string)  {  	struct event_subsystem *system = dir->subsystem; +	struct trace_array *tr = dir->tr;  	struct event_filter *filter;  	int err = 0; @@ -1901,18 +2041,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  	}  	if (!strcmp(strstrip(filter_string), "0")) { -		filter_free_subsystem_preds(system); +		filter_free_subsystem_preds(system, tr);  		remove_filter_string(system->filter);  		filter = system->filter;  		system->filter = NULL;  		/* Ensure all filters are no longer used */  		synchronize_sched(); -		filter_free_subsystem_filters(system); +		filter_free_subsystem_filters(system, tr);  		__free_filter(filter);  		goto out_unlock;  	} -	err = create_system_filter(system, filter_string, &filter); +	err = create_system_filter(system, tr, filter_string, &filter);  	if (filter) {  		/*  		 * No event actually uses the system filter diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d21a74670088..7c3e3e72e2b6 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -180,7 +180,7 @@ struct ftrace_event_call __used event_##call = {			\  	.event.type		= etype,				\  	.class			= &event_class_ftrace_##call,		\  	.print_fmt		= print,				\ -	.flags			= TRACE_EVENT_FL_IGNORE_ENABLE,		\ +	.flags			= TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \  };									\  struct ftrace_event_call __used						\  __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 
b5c09242683d..0b99120d395c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -82,9 +82,9 @@ static struct trace_array *graph_array;   * to fill in space into DURATION column.   */  enum { -	DURATION_FILL_FULL  = -1, -	DURATION_FILL_START = -2, -	DURATION_FILL_END   = -3, +	FLAGS_FILL_FULL  = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT, +	FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT, +	FLAGS_FILL_END   = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT,  };  static enum print_line_t @@ -114,16 +114,37 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,  		return -EBUSY;  	} +	/* +	 * The curr_ret_stack is an index to ftrace return stack of +	 * current task.  Its value should be in [0, FTRACE_RETFUNC_ +	 * DEPTH) when the function graph tracer is used.  To support +	 * filtering out specific functions, it makes the index +	 * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) +	 * so when it sees a negative index the ftrace will ignore +	 * the record.  And the index gets recovered when returning +	 * from the filtered function by adding the FTRACE_NOTRACE_ +	 * DEPTH and then it'll continue to record functions normally. +	 * +	 * The curr_ret_stack is initialized to -1 and get increased +	 * in this function.  So it can be less than -1 only if it was +	 * filtered out via ftrace_graph_notrace_addr() which can be +	 * set from set_graph_notrace file in debugfs by user. +	 */ +	if (current->curr_ret_stack < -1) +		return -EBUSY; +  	calltime = trace_clock_local();  	index = ++current->curr_ret_stack; +	if (ftrace_graph_notrace_addr(func)) +		current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH;  	barrier();  	current->ret_stack[index].ret = ret;  	current->ret_stack[index].func = func;  	current->ret_stack[index].calltime = calltime;  	current->ret_stack[index].subtime = 0;  	current->ret_stack[index].fp = frame_pointer; -	*depth = index; +	*depth = current->curr_ret_stack;  	return 0;  } @@ -137,7 +158,17 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,  	index = current->curr_ret_stack; -	if (unlikely(index < 0)) { +	/* +	 * A negative index here means that it's just returned from a +	 * notrace'd function.  Recover index to get an original +	 * return address.  See ftrace_push_return_trace(). +	 * +	 * TODO: Need to check whether the stack gets corrupted. +	 */ +	if (index < 0) +		index += FTRACE_NOTRACE_DEPTH; + +	if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {  		ftrace_graph_stop();  		WARN_ON(1);  		/* Might as well panic, otherwise we have no where to go */ @@ -193,6 +224,15 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)  	trace.rettime = trace_clock_local();  	barrier();  	current->curr_ret_stack--; +	/* +	 * The curr_ret_stack can be less than -1 only if it was +	 * filtered out and it's about to return from the function. +	 * Recover the index and continue to trace normal functions. 
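/*
 * Illustrative arithmetic, not part of the patch: set_graph_notrace parks the
 * return-stack index below zero by subtracting a large bias on entry to a
 * filtered function and adds it back on return, so anything recorded while
 * the index is negative is ignored.  BIAS stands in for FTRACE_NOTRACE_DEPTH;
 * its value here is an assumption made only for this demonstration.
 */
#include <stdio.h>

#define BIAS 65536              /* assumed to dwarf any real call depth */

int main(void)
{
	int idx = -1;           /* empty return stack */

	idx++;                  /* entry is pushed: idx == 0 */
	idx -= BIAS;            /* function matched set_graph_notrace: go negative */
	printf("inside filtered function: idx=%d (ignored while negative)\n", idx);

	idx--;                  /* returning from it */
	if (idx < -1)
		idx += BIAS;    /* recover the real index, as the patch does */
	printf("after return: idx=%d (back to an empty stack)\n", idx);
	return 0;
}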
+	 */ +	if (current->curr_ret_stack < -1) { +		current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; +		return ret; +	}  	/*  	 * The trace should run after decrementing the ret counter @@ -230,7 +270,7 @@ int __trace_graph_entry(struct trace_array *tr,  		return 0;  	entry	= ring_buffer_event_data(event);  	entry->graph_ent			= *trace; -	if (!filter_current_check_discard(buffer, call, entry, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		__buffer_unlock_commit(buffer, event);  	return 1; @@ -259,10 +299,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)  	/* trace it when it is-nested-in or is a function enabled. */  	if ((!(trace->depth || ftrace_graph_addr(trace->func)) || -	     ftrace_graph_ignore_irqs()) || +	     ftrace_graph_ignore_irqs()) || (trace->depth < 0) ||  	    (max_depth && trace->depth >= max_depth))  		return 0; +	/* +	 * Do not trace a function if it's filtered by set_graph_notrace. +	 * Make the index of ret stack negative to indicate that it should +	 * ignore further functions.  But it needs its own ret stack entry +	 * to recover the original index in order to continue tracing after +	 * returning from the function. +	 */ +	if (ftrace_graph_notrace_addr(trace->func)) +		return 1; +  	local_irq_save(flags);  	cpu = raw_smp_processor_id();  	data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -335,7 +385,7 @@ void __trace_graph_return(struct trace_array *tr,  		return;  	entry	= ring_buffer_event_data(event);  	entry->ret				= *trace; -	if (!filter_current_check_discard(buffer, call, entry, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		__buffer_unlock_commit(buffer, event);  } @@ -652,7 +702,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,  	}  	/* No overhead */ -	ret = print_graph_duration(DURATION_FILL_START, s, flags); +	ret = print_graph_duration(0, s, flags | FLAGS_FILL_START);  	if (ret != TRACE_TYPE_HANDLED)  		return ret; @@ -664,7 +714,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; -	ret = print_graph_duration(DURATION_FILL_END, s, flags); +	ret = print_graph_duration(0, s, flags | FLAGS_FILL_END);  	if (ret != TRACE_TYPE_HANDLED)  		return ret; @@ -729,14 +779,14 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,  			return TRACE_TYPE_HANDLED;  	/* No real adata, just filling the column with spaces */ -	switch (duration) { -	case DURATION_FILL_FULL: +	switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { +	case FLAGS_FILL_FULL:  		ret = trace_seq_puts(s, "              |  ");  		return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -	case DURATION_FILL_START: +	case FLAGS_FILL_START:  		ret = trace_seq_puts(s, "  ");  		return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; -	case DURATION_FILL_END: +	case FLAGS_FILL_END:  		ret = trace_seq_puts(s, " |");  		return ret ? 
TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;  	} @@ -852,7 +902,7 @@ print_graph_entry_nested(struct trace_iterator *iter,  	}  	/* No time */ -	ret = print_graph_duration(DURATION_FILL_FULL, s, flags); +	ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);  	if (ret != TRACE_TYPE_HANDLED)  		return ret; @@ -1172,7 +1222,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,  		return TRACE_TYPE_PARTIAL_LINE;  	/* No time */ -	ret = print_graph_duration(DURATION_FILL_FULL, s, flags); +	ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL);  	if (ret != TRACE_TYPE_HANDLED)  		return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 243f6834d026..dae9541ada9e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -835,7 +835,7 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,  	entry->ip = (unsigned long)tp->rp.kp.addr;  	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); -	if (!filter_current_check_discard(buffer, call, entry, event)) +	if (!filter_check_discard(ftrace_file, entry, buffer, event))  		trace_buffer_unlock_commit_regs(buffer, event,  						irq_flags, pc, regs);  } @@ -884,7 +884,7 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,  	entry->ret_ip = (unsigned long)ri->ret_addr;  	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); -	if (!filter_current_check_discard(buffer, call, entry, event)) +	if (!filter_check_discard(ftrace_file, entry, buffer, event))  		trace_buffer_unlock_commit_regs(buffer, event,  						irq_flags, pc, regs);  } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index b3dcfb2f0fef..0abd9b863474 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -323,7 +323,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,  	entry	= ring_buffer_event_data(event);  	entry->rw			= *rw; -	if (!filter_check_discard(call, entry, buffer, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		trace_buffer_unlock_commit(buffer, event, 0, pc);  } @@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,  	entry	= ring_buffer_event_data(event);  	entry->map			= *map; -	if (!filter_check_discard(call, entry, buffer, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		trace_buffer_unlock_commit(buffer, event, 0, pc);  } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 34e7cbac0c9c..ed32284fbe32 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)  		(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :  		(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :  		'.'; -	need_resched = -		(entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + +	switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | +				TRACE_FLAG_PREEMPT_RESCHED)) { +	case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: +		need_resched = 'N'; +		break; +	case TRACE_FLAG_NEED_RESCHED: +		need_resched = 'n'; +		break; +	case TRACE_FLAG_PREEMPT_RESCHED: +		need_resched = 'p'; +		break; +	default: +		need_resched = '.'; +		break; +	} +  	hardsoft_irq =  		(hardirq && softirq) ? 'H' :  		hardirq ? 
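/*
 * Illustrative sketch, not part of the patch: the latency-format column now
 * folds two flags into one character -- 'N' when both are set, 'n' for
 * need-resched only, 'p' for preempt-resched only, '.' for neither.  The
 * flag values mirror the trace.h hunk earlier in this diff; the macro names
 * below are local stand-ins.
 */
#include <stdio.h>

#define FLAG_NEED_RESCHED    0x04
#define FLAG_PREEMPT_RESCHED 0x20

static char resched_char(unsigned int flags)
{
	switch (flags & (FLAG_NEED_RESCHED | FLAG_PREEMPT_RESCHED)) {
	case FLAG_NEED_RESCHED | FLAG_PREEMPT_RESCHED:
		return 'N';
	case FLAG_NEED_RESCHED:
		return 'n';
	case FLAG_PREEMPT_RESCHED:
		return 'p';
	default:
		return '.';
	}
}

int main(void)
{
	printf("%c %c %c %c\n",
	       resched_char(FLAG_NEED_RESCHED | FLAG_PREEMPT_RESCHED),
	       resched_char(FLAG_NEED_RESCHED),
	       resched_char(FLAG_PREEMPT_RESCHED),
	       resched_char(0));
	return 0;
}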
'h' : diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 4e98e3b257a3..3f34dc9b40f3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr,  	entry->next_state		= next->state;  	entry->next_cpu	= task_cpu(next); -	if (!filter_check_discard(call, entry, buffer, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		trace_buffer_unlock_commit(buffer, event, flags, pc);  } @@ -101,7 +101,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,  	entry->next_state		= wakee->state;  	entry->next_cpu			= task_cpu(wakee); -	if (!filter_check_discard(call, entry, buffer, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		trace_buffer_unlock_commit(buffer, event, flags, pc);  } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 847f88a6194b..7af67360b330 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -43,46 +43,15 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);  /* The root directory for all stat files */  static struct dentry		*stat_dir; -/* - * Iterate through the rbtree using a post order traversal path - * to release the next node. - * It won't necessary release one at each iteration - * but it will at least advance closer to the next one - * to be released. - */ -static struct rb_node *release_next(struct tracer_stat *ts, -				    struct rb_node *node) +static void __reset_stat_session(struct stat_session *session)  { -	struct stat_node *snode; -	struct rb_node *parent = rb_parent(node); - -	if (node->rb_left) -		return node->rb_left; -	else if (node->rb_right) -		return node->rb_right; -	else { -		if (!parent) -			; -		else if (parent->rb_left == node) -			parent->rb_left = NULL; -		else -			parent->rb_right = NULL; +	struct stat_node *snode, *n; -		snode = container_of(node, struct stat_node, node); -		if (ts->stat_release) -			ts->stat_release(snode->stat); +	rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) { +		if (session->ts->stat_release) +			session->ts->stat_release(snode->stat);  		kfree(snode); - -		return parent;  	} -} - -static void __reset_stat_session(struct stat_session *session) -{ -	struct rb_node *node = session->stat_root.rb_node; - -	while (node) -		node = release_next(session->ts, node);  	session->stat_root = RB_ROOT;  } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 559329d9bd2f..e4b6d11bdf78 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -302,6 +302,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)  static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  {  	struct trace_array *tr = data; +	struct ftrace_event_file *ftrace_file;  	struct syscall_trace_enter *entry;  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event; @@ -314,7 +315,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return; -	if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) + +	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ +	ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); +	if (!ftrace_file) +		return; + +	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))  		return;  	sys_data = syscall_nr_to_meta(syscall_nr); @@ -336,8 +343,7 @@ static 
void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  	entry->nr = syscall_nr;  	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); -	if (!filter_current_check_discard(buffer, sys_data->enter_event, -					  entry, event)) +	if (!filter_check_discard(ftrace_file, entry, buffer, event))  		trace_current_buffer_unlock_commit(buffer, event,  						   irq_flags, pc);  } @@ -345,6 +351,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)  {  	struct trace_array *tr = data; +	struct ftrace_event_file *ftrace_file;  	struct syscall_trace_exit *entry;  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event; @@ -356,7 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)  	syscall_nr = trace_get_syscall_nr(current, regs);  	if (syscall_nr < 0)  		return; -	if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) + +	/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ +	ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); +	if (!ftrace_file) +		return; + +	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags))  		return;  	sys_data = syscall_nr_to_meta(syscall_nr); @@ -377,8 +390,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)  	entry->nr = syscall_nr;  	entry->ret = syscall_get_return_value(current, regs); -	if (!filter_current_check_discard(buffer, sys_data->exit_event, -					  entry, event)) +	if (!filter_check_discard(ftrace_file, entry, buffer, event))  		trace_current_buffer_unlock_commit(buffer, event,  						   irq_flags, pc);  } @@ -397,7 +409,7 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file,  	if (!tr->sys_refcount_enter)  		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);  	if (!ret) { -		set_bit(num, tr->enabled_enter_syscalls); +		rcu_assign_pointer(tr->enter_syscall_files[num], file);  		tr->sys_refcount_enter++;  	}  	mutex_unlock(&syscall_trace_lock); @@ -415,10 +427,15 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,  		return;  	mutex_lock(&syscall_trace_lock);  	tr->sys_refcount_enter--; -	clear_bit(num, tr->enabled_enter_syscalls); +	rcu_assign_pointer(tr->enter_syscall_files[num], NULL);  	if (!tr->sys_refcount_enter)  		unregister_trace_sys_enter(ftrace_syscall_enter, tr);  	mutex_unlock(&syscall_trace_lock); +	/* +	 * Callers expect the event to be completely disabled on +	 * return, so wait for current handlers to finish. 
+	 */ +	synchronize_sched();  }  static int reg_event_syscall_exit(struct ftrace_event_file *file, @@ -435,7 +452,7 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file,  	if (!tr->sys_refcount_exit)  		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);  	if (!ret) { -		set_bit(num, tr->enabled_exit_syscalls); +		rcu_assign_pointer(tr->exit_syscall_files[num], file);  		tr->sys_refcount_exit++;  	}  	mutex_unlock(&syscall_trace_lock); @@ -453,10 +470,15 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,  		return;  	mutex_lock(&syscall_trace_lock);  	tr->sys_refcount_exit--; -	clear_bit(num, tr->enabled_exit_syscalls); +	rcu_assign_pointer(tr->exit_syscall_files[num], NULL);  	if (!tr->sys_refcount_exit)  		unregister_trace_sys_exit(ftrace_syscall_exit, tr);  	mutex_unlock(&syscall_trace_lock); +	/* +	 * Callers expect the event to be completely disabled on +	 * return, so wait for current handlers to finish. +	 */ +	synchronize_sched();  }  static int __init init_syscall_trace(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 272261b5f94f..b6dcc42ef7f5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -128,6 +128,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)  	if (is_ret)  		tu->consumer.ret_handler = uretprobe_dispatcher;  	init_trace_uprobe_filter(&tu->filter); +	tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;  	return tu;  error: @@ -561,7 +562,7 @@ static void uprobe_trace_print(struct trace_uprobe *tu,  	for (i = 0; i < tu->nr_args; i++)  		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); -	if (!filter_current_check_discard(buffer, call, entry, event)) +	if (!call_filter_check_discard(call, entry, buffer, event))  		trace_buffer_unlock_commit(buffer, event, 0, 0);  } diff --git a/kernel/up.c b/kernel/up.c index 630d72bf7e41..509403e3fbc6 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,  }  EXPORT_SYMBOL(smp_call_function_single); +void __smp_call_function_single(int cpu, struct call_single_data *csd, +				int wait) +{ +	unsigned long flags; + +	local_irq_save(flags); +	csd->func(csd->info); +	local_irq_restore(flags); +} +EXPORT_SYMBOL(__smp_call_function_single); +  int on_each_cpu(smp_call_func_t func, void *info, int wait)  {  	unsigned long flags; diff --git a/kernel/user.c b/kernel/user.c index 5bbb91988e69..a3a0dbfda329 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,10 @@ struct user_namespace init_user_ns = {  	.owner = GLOBAL_ROOT_UID,  	.group = GLOBAL_ROOT_GID,  	.proc_inum = PROC_USER_INIT_INO, +#ifdef CONFIG_KEYS_KERBEROS_CACHE +	.krb_cache_register_sem = +	__RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem), +#endif  };  EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 13fb1134ba58..240fb62cf394 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -101,6 +101,9 @@ int create_user_ns(struct cred *new)  	set_cred_user_ns(new, ns); +#ifdef CONFIG_PERSISTENT_KEYRINGS +	init_rwsem(&ns->persistent_keyring_register_sem); +#endif  	return 0;  } @@ -130,6 +133,9 @@ void free_user_ns(struct user_namespace *ns)  	do {  		parent = ns->parent; +#ifdef CONFIG_PERSISTENT_KEYRINGS +		key_put(ns->persistent_keyring_register); +#endif  		proc_free_inum(ns->proc_inum);  		kmem_cache_free(user_ns_cachep, ns);  		ns = parent; diff 
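/*
 * Illustrative sketch, not part of the patch: the syscall events now keep a
 * per-syscall pointer that the handler reads and that the enable/disable
 * paths publish or clear.  This user-space stand-in uses C11 release/acquire
 * in place of rcu_assign_pointer()/rcu_dereference_sched(); the grace-period
 * wait the patch adds (synchronize_sched) has no one-line analogue here.
 * All names are invented for the example.
 */
#include <stdatomic.h>
#include <stdio.h>

struct event_file { const char *name; };

static _Atomic(struct event_file *) enter_files[8];    /* per-syscall slots */

static void syscall_handler(int nr)
{
	struct event_file *f = atomic_load_explicit(&enter_files[nr],
						    memory_order_acquire);
	if (!f)
		return;                 /* tracing of this syscall is disabled */
	printf("trace %s\n", f->name);
}

int main(void)
{
	static struct event_file open_evt = { "sys_enter_open" };

	syscall_handler(2);                                   /* disabled: silent */
	atomic_store_explicit(&enter_files[2], &open_evt,
			      memory_order_release);          /* enable */
	syscall_handler(2);
	atomic_store_explicit(&enter_files[2], NULL,
			      memory_order_release);          /* disable again */
	syscall_handler(2);
	return 0;
}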
--git a/kernel/workqueue.c b/kernel/workqueue.c index 987293d03ebc..c66912be990f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);  /* I: attributes used when instantiating standard unbound pools on demand */  static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; +/* I: attributes used when instantiating ordered pools on demand */ +static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; +  struct workqueue_struct *system_wq __read_mostly;  EXPORT_SYMBOL(system_wq);  struct workqueue_struct *system_highpri_wq __read_mostly; @@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { }  static inline void debug_work_deactivate(struct work_struct *work) { }  #endif -/* allocate ID and assign it to @pool */ +/** + * worker_pool_assign_id - allocate ID and assing it to @pool + * @pool: the pool pointer of interest + * + * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned + * successfully, -errno on failure. + */  static int worker_pool_assign_id(struct worker_pool *pool)  {  	int ret;  	lockdep_assert_held(&wq_pool_mutex); -	ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); +	ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, +			GFP_KERNEL);  	if (ret >= 0) {  		pool->id = ret;  		return 0; @@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,  	debug_work_activate(work); -	/* if dying, only works from the same workqueue are allowed */ +	/* if draining, only works from the same workqueue are allowed */  	if (unlikely(wq->flags & __WQ_DRAINING) &&  	    WARN_ON_ONCE(!is_chained_work(wq)))  		return; @@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool)  	if (IS_ERR(worker->task))  		goto fail; +	set_user_nice(worker->task, pool->attrs->nice); + +	/* prevent userland from meddling with cpumask of workqueue workers */ +	worker->task->flags |= PF_NO_SETAFFINITY; +  	/*  	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any  	 * online CPUs.  It'll be re-applied when any of the CPUs come up.  	 */ -	set_user_nice(worker->task, pool->attrs->nice);  	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); -	/* prevent userland from meddling with cpumask of workqueue workers */ -	worker->task->flags |= PF_NO_SETAFFINITY; -  	/*  	 * The caller is responsible for ensuring %POOL_DISASSOCIATED  	 * remains stable across this function.  
See the comments above the @@ -4106,7 +4117,7 @@ out_unlock:  static int alloc_and_link_pwqs(struct workqueue_struct *wq)  {  	bool highpri = wq->flags & WQ_HIGHPRI; -	int cpu; +	int cpu, ret;  	if (!(wq->flags & WQ_UNBOUND)) {  		wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); @@ -4126,6 +4137,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)  			mutex_unlock(&wq->mutex);  		}  		return 0; +	} else if (wq->flags & __WQ_ORDERED) { +		ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); +		/* there should only be single pwq for ordering guarantee */ +		WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || +			      wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), +		     "ordering guarantee broken for workqueue %s\n", wq->name); +		return ret;  	} else {  		return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);  	} @@ -5009,10 +5027,6 @@ static int __init init_workqueues(void)  	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };  	int i, cpu; -	/* make sure we have enough bits for OFFQ pool ID */ -	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < -		     WORK_CPU_END * NR_STD_WORKER_POOLS); -  	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));  	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); @@ -5051,13 +5065,23 @@ static int __init init_workqueues(void)  		}  	} -	/* create default unbound wq attrs */ +	/* create default unbound and ordered wq attrs */  	for (i = 0; i < NR_STD_WORKER_POOLS; i++) {  		struct workqueue_attrs *attrs;  		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));  		attrs->nice = std_nice[i];  		unbound_std_wq_attrs[i] = attrs; + +		/* +		 * An ordered wq should have only one pwq as ordering is +		 * guaranteed by max_active which is enforced by pwqs. +		 * Turn off NUMA so that dfl_pwq is used for all nodes. +		 */ +		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); +		attrs->nice = std_nice[i]; +		attrs->no_numa = true; +		ordered_wq_attrs[i] = attrs;  	}  	system_wq = alloc_workqueue("events", 0, 0);  | 
