diff options
Diffstat (limited to 'kernel')
76 files changed, 3443 insertions, 2220 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 271fd3119af9..35ef1185e359 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -2,14 +2,14 @@  # Makefile for the linux kernel.  # -obj-y     = fork.o exec_domain.o panic.o printk.o \ +obj-y     = fork.o exec_domain.o panic.o \  	    cpu.o exit.o itimer.o time.o softirq.o resource.o \  	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \  	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \  	    rcupdate.o extable.o params.o posix-timers.o \  	    kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \  	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ -	    notifier.o ksysfs.o cred.o \ +	    notifier.o ksysfs.o cred.o reboot.o \  	    async.o range.o groups.o lglock.o smpboot.o  ifdef CONFIG_FUNCTION_TRACER @@ -24,6 +24,7 @@ endif  obj-y += sched/  obj-y += power/ +obj-y += printk/  obj-y += cpu/  obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o diff --git a/kernel/audit.h b/kernel/audit.h index 1c95131ef760..123c9b7c3979 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -85,6 +85,7 @@ struct audit_names {  	struct filename		*name;  	int			name_len;	/* number of chars to log */ +	bool			hidden;		/* don't log this record */  	bool			name_put;	/* call __putname()? */  	unsigned long		ino; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 6bd4a90d1991..f7aee8be7fb2 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,  		f->lsm_rule = NULL;  		/* Support legacy tests for a valid loginuid */ -		if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { +		if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) {  			f->type = AUDIT_LOGINUID_SET;  			f->val = 0;  		} @@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry)  		err = audit_add_watch(&entry->rule, &list);  		if (err) {  			mutex_unlock(&audit_filter_mutex); +			/* +			 * normally audit_add_tree_rule() will free it +			 * on failure +			 */ +			if (tree) +				audit_put_tree(tree);  			goto error;  		}  	} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3c8a601324a2..9845cb32b60a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts  	}  	i = 0; -	list_for_each_entry(n, &context->names_list, list) +	list_for_each_entry(n, &context->names_list, list) { +		if (n->hidden) +			continue;  		audit_log_name(context, n, NULL, i++, &call_panic); +	}  	/* Send end of event record to help user space know we are finished */  	ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); @@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name)   * __audit_inode - store the inode and device from a lookup   * @name: name being audited   * @dentry: dentry being audited - * @parent: does this dentry represent the parent? + * @flags: attributes for this particular entry   */  void __audit_inode(struct filename *name, const struct dentry *dentry, -		   unsigned int parent) +		   unsigned int flags)  {  	struct audit_context *context = current->audit_context;  	const struct inode *inode = dentry->d_inode;  	struct audit_names *n; +	bool parent = flags & AUDIT_INODE_PARENT;  	if (!context->in_syscall)  		return; @@ -1831,6 +1835,8 @@ out:  	if (parent) {  		n->name_len = n->name ? 
parent_len(n->name->name) : AUDIT_NAME_FULL;  		n->type = AUDIT_TYPE_PARENT; +		if (flags & AUDIT_INODE_HIDDEN) +			n->hidden = true;  	} else {  		n->name_len = AUDIT_NAME_FULL;  		n->type = AUDIT_TYPE_NORMAL; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5583d10a325..781845a013ab 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -802,7 +802,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,   */  static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);  static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,  			       unsigned long subsys_mask); @@ -1846,36 +1845,43 @@ out:  EXPORT_SYMBOL_GPL(cgroup_path);  /** - * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy + * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy   * @task: target task - * @hierarchy_id: the hierarchy to look up @task's cgroup from   * @buf: the buffer to write the path into   * @buflen: the length of the buffer   * - * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and - * copy its path into @buf.  This function grabs cgroup_mutex and shouldn't - * be used inside locks used by cgroup controller callbacks. + * Determine @task's cgroup on the first (the one with the lowest non-zero + * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This + * function grabs cgroup_mutex and shouldn't be used inside locks used by + * cgroup controller callbacks. + * + * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.   */ -int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, -				    char *buf, size_t buflen) +int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)  {  	struct cgroupfs_root *root; -	struct cgroup *cgrp = NULL; -	int ret = -ENOENT; +	struct cgroup *cgrp; +	int hierarchy_id = 1, ret = 0; + +	if (buflen < 2) +		return -ENAMETOOLONG;  	mutex_lock(&cgroup_mutex); -	root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); +	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); +  	if (root) {  		cgrp = task_cgroup_from_root(task, root);  		ret = cgroup_path(cgrp, buf, buflen); +	} else { +		/* if no hierarchy exists, everyone is in "/" */ +		memcpy(buf, "/", 2);  	}  	mutex_unlock(&cgroup_mutex); -  	return ret;  } -EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); +EXPORT_SYMBOL_GPL(task_cgroup_path);  /*   * Control Group taskset @@ -2642,7 +2648,7 @@ static const struct inode_operations cgroup_file_inode_operations = {  };  static const struct inode_operations cgroup_dir_inode_operations = { -	.lookup = cgroup_lookup, +	.lookup = simple_lookup,  	.mkdir = cgroup_mkdir,  	.rmdir = cgroup_rmdir,  	.rename = cgroup_rename, @@ -2652,14 +2658,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {  	.removexattr = cgroup_removexattr,  }; -static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) -{ -	if (dentry->d_name.len > NAME_MAX) -		return ERR_PTR(-ENAMETOOLONG); -	d_add(dentry, NULL); -	return NULL; -} -  /*   * Check if a file is a control file   */ @@ -4337,8 +4335,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,  		}  		err = percpu_ref_init(&css->refcnt, css_release); -		if (err) +		if (err) { +			ss->css_free(cgrp);  			goto err_free_all; +		}  		
init_cgroup_css(css, ss, cgrp); diff --git a/kernel/cpu.c b/kernel/cpu.c index 198a38883e64..b2b227b82123 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -366,7 +366,7 @@ EXPORT_SYMBOL(cpu_down);  #endif /*CONFIG_HOTPLUG_CPU*/  /* Requires cpu_add_remove_lock to be held */ -static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) +static int _cpu_up(unsigned int cpu, int tasks_frozen)  {  	int ret, nr_calls = 0;  	void *hcpu = (void *)(long)cpu; @@ -419,7 +419,7 @@ out:  	return ret;  } -int __cpuinit cpu_up(unsigned int cpu) +int cpu_up(unsigned int cpu)  {  	int err = 0; @@ -618,7 +618,7 @@ core_initcall(cpu_hotplug_pm_sync_init);   * It must be called by the arch code on the new cpu, before the new cpu   * enables interrupts and before the "boot" cpu returns from __cpu_up().   */ -void __cpuinit notify_cpu_starting(unsigned int cpu) +void notify_cpu_starting(unsigned int cpu)  {  	unsigned long val = CPU_STARTING; diff --git a/kernel/events/core.c b/kernel/events/core.c index 1db3af933704..f86599e8c123 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -182,7 +182,7 @@ void update_perf_cpu_limits(void)  	u64 tmp = perf_sample_period_ns;  	tmp *= sysctl_perf_cpu_time_max_percent; -	tmp = do_div(tmp, 100); +	do_div(tmp, 100);  	atomic_set(&perf_sample_allowed_ns, tmp);  } @@ -232,7 +232,7 @@ DEFINE_PER_CPU(u64, running_sample_length);  void perf_sample_event_took(u64 sample_len_ns)  {  	u64 avg_local_sample_len; -	u64 local_samples_len = __get_cpu_var(running_sample_length); +	u64 local_samples_len;  	if (atomic_read(&perf_sample_allowed_ns) == 0)  		return; @@ -947,8 +947,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)  {  	struct perf_event_context *ctx; -	rcu_read_lock();  retry: +	/* +	 * One of the few rules of preemptible RCU is that one cannot do +	 * rcu_read_unlock() while holding a scheduler (or nested) lock when +	 * part of the read side critical section was preemptible -- see +	 * rcu_read_unlock_special(). +	 * +	 * Since ctx->lock nests under rq->lock we must ensure the entire read +	 * side critical section is non-preemptible. +	 */ +	preempt_disable(); +	rcu_read_lock();  	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);  	if (ctx) {  		/* @@ -964,6 +974,8 @@ retry:  		raw_spin_lock_irqsave(&ctx->lock, *flags);  		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {  			raw_spin_unlock_irqrestore(&ctx->lock, *flags); +			rcu_read_unlock(); +			preempt_enable();  			goto retry;  		} @@ -973,6 +985,7 @@ retry:  		}  	}  	rcu_read_unlock(); +	preempt_enable();  	return ctx;  } @@ -1950,7 +1963,16 @@ static int __perf_event_enable(void *info)  	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);  	int err; -	if (WARN_ON_ONCE(!ctx->is_active)) +	/* +	 * There's a time window between 'ctx->is_active' check +	 * in perf_event_enable function and this place having: +	 *   - IRQs on +	 *   - ctx->lock unlocked +	 * +	 * where the task could be killed and 'ctx' deactivated +	 * by perf_event_exit_task. +	 */ +	if (!ctx->is_active)  		return -EINVAL;  	raw_spin_lock(&ctx->lock); @@ -6212,8 +6234,6 @@ perf_event_mux_interval_ms_store(struct device *dev,  	return count;  } -#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) -  static struct device_attribute pmu_dev_attrs[] = {  	__ATTR_RO(type),  	__ATTR_RW(perf_event_mux_interval_ms), @@ -7465,7 +7485,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,  		 * child.  		 
*/ -		child_ctx = alloc_perf_context(event->pmu, child); +		child_ctx = alloc_perf_context(parent_ctx->pmu, child);  		if (!child_ctx)  			return -ENOMEM; @@ -7608,7 +7628,7 @@ static void __init perf_event_init_all_cpus(void)  	}  } -static void __cpuinit perf_event_init_cpu(int cpu) +static void perf_event_init_cpu(int cpu)  {  	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -7697,7 +7717,7 @@ static struct notifier_block perf_reboot_notifier = {  	.priority = INT_MIN,  }; -static int __cpuinit +static int  perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)  {  	unsigned int cpu = (long)hcpu; diff --git a/kernel/exit.c b/kernel/exit.c index fafe75d9e6f6..a949819055d5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -808,7 +808,7 @@ void do_exit(long code)  	/*  	 * FIXME: do that only when needed, using sched_exit tracepoint  	 */ -	ptrace_put_breakpoints(tsk); +	flush_ptrace_hw_breakpoint(tsk);  	exit_notify(tsk, group_dead);  #ifdef CONFIG_NUMA diff --git a/kernel/fork.c b/kernel/fork.c index 6e6a1c11b3e5..403d2bb8a968 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -365,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)  	mm->locked_vm = 0;  	mm->mmap = NULL;  	mm->mmap_cache = NULL; -	mm->free_area_cache = oldmm->mmap_base; -	mm->cached_hole_size = ~0UL;  	mm->map_count = 0;  	cpumask_clear(mm_cpumask(mm));  	mm->mm_rb = RB_ROOT; @@ -540,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)  	mm->nr_ptes = 0;  	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));  	spin_lock_init(&mm->page_table_lock); -	mm->free_area_cache = TASK_UNMAPPED_BASE; -	mm->cached_hole_size = ~0UL;  	mm_init_aio(mm);  	mm_init_owner(mm, p); @@ -1550,7 +1546,7 @@ static inline void init_idle_pids(struct pid_link *links)  	}  } -struct task_struct * __cpuinit fork_idle(int cpu) +struct task_struct *fork_idle(int cpu)  {  	struct task_struct *task;  	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); diff --git a/kernel/freezer.c b/kernel/freezer.c index 8b2afc1c9df0..b462fa197517 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -33,7 +33,7 @@ static DEFINE_SPINLOCK(freezer_lock);   */  bool freezing_slow_path(struct task_struct *p)  { -	if (p->flags & PF_NOFREEZE) +	if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))  		return false;  	if (pm_nosig_freezing || cgroup_freezing(p)) diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 3ee4d06c6fc2..383319bae3f7 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -722,17 +722,20 @@ static int hrtimer_switch_to_hres(void)  	return 1;  } +static void clock_was_set_work(struct work_struct *work) +{ +	clock_was_set(); +} + +static DECLARE_WORK(hrtimer_work, clock_was_set_work); +  /* - * Called from timekeeping code to reprogramm the hrtimer interrupt - * device. If called from the timer interrupt context we defer it to - * softirq context. + * Called from timekeeping and resume code to reprogramm the hrtimer + * interrupt device on all cpus.   */  void clock_was_set_delayed(void)  { -	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - -	cpu_base->clock_was_set = 1; -	__raise_softirq_irqoff(HRTIMER_SOFTIRQ); +	schedule_work(&hrtimer_work);  }  #else @@ -774,15 +777,19 @@ void clock_was_set(void)  /*   * During resume we might have to reprogram the high resolution timer - * interrupt (on the local CPU): + * interrupt on all online CPUs.  
However, all other CPUs will be + * stopped with IRQs interrupts disabled so the clock_was_set() call + * must be deferred.   */  void hrtimers_resume(void)  {  	WARN_ONCE(!irqs_disabled(),  		  KERN_INFO "hrtimers_resume() called with IRQs enabled!"); +	/* Retrigger on the local CPU */  	retrigger_next_event(NULL); -	timerfd_clock_was_set(); +	/* And schedule a retrigger for all others */ +	clock_was_set_delayed();  }  static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) @@ -1433,13 +1440,6 @@ void hrtimer_peek_ahead_timers(void)  static void run_hrtimer_softirq(struct softirq_action *h)  { -	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - -	if (cpu_base->clock_was_set) { -		cpu_base->clock_was_set = 0; -		clock_was_set(); -	} -  	hrtimer_peek_ahead_timers();  } @@ -1659,7 +1659,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,  /*   * Functions related to boot-time initialization:   */ -static void __cpuinit init_hrtimers_cpu(int cpu) +static void init_hrtimers_cpu(int cpu)  {  	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);  	int i; @@ -1740,7 +1740,7 @@ static void migrate_hrtimers(int scpu)  #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, +static int hrtimer_cpu_notify(struct notifier_block *self,  					unsigned long action, void *hcpu)  {  	int scpu = (long)hcpu; @@ -1773,7 +1773,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,  	return NOTIFY_OK;  } -static struct notifier_block __cpuinitdata hrtimers_nb = { +static struct notifier_block hrtimers_nb = {  	.notifier_call = hrtimer_cpu_notify,  }; diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 1c39eccc1eaf..452d6f2ba21d 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -135,7 +135,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d)  }  /** - * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt + * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt   * @d: irq_data   */  void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) @@ -275,10 +275,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,  	if (d->gc)  		return -EBUSY; -	if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) -		return -EINVAL; - -	numchips = d->revmap_data.linear.size / irqs_per_chip; +	numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip);  	if (!numchips)  		return -EINVAL; @@ -310,6 +307,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,  		/* Calc pointer to the next generic chip */  		tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);  	} +	d->name = name;  	return 0;  }  EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1ed8dff17eb9..706724e9835d 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,9 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex);  static struct irq_domain *irq_default_domain;  /** - * irq_domain_alloc() - Allocate a new irq_domain data structure + * __irq_domain_add() - Allocate a new irq_domain data structure   * @of_node: optional device-tree node of the interrupt controller - * @revmap_type: type of reverse mapping to use + * @size: Size of linear map; 0 for radix mapping only + * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no + *              direct mapping   * @ops: map/unmap domain callbacks   * @host_data: Controller private data 
pointer   * @@ -33,41 +35,35 @@ static struct irq_domain *irq_default_domain;   * register allocated irq_domain with irq_domain_register().  Returns pointer   * to IRQ domain, or NULL on failure.   */ -static struct irq_domain *irq_domain_alloc(struct device_node *of_node, -					   unsigned int revmap_type, -					   const struct irq_domain_ops *ops, -					   void *host_data) +struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, +				    irq_hw_number_t hwirq_max, int direct_max, +				    const struct irq_domain_ops *ops, +				    void *host_data)  {  	struct irq_domain *domain; -	domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, -			      of_node_to_nid(of_node)); +	domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), +			      GFP_KERNEL, of_node_to_nid(of_node));  	if (WARN_ON(!domain))  		return NULL;  	/* Fill structure */ -	domain->revmap_type = revmap_type; +	INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);  	domain->ops = ops;  	domain->host_data = host_data;  	domain->of_node = of_node_get(of_node); +	domain->hwirq_max = hwirq_max; +	domain->revmap_size = size; +	domain->revmap_direct_max_irq = direct_max; -	return domain; -} - -static void irq_domain_free(struct irq_domain *domain) -{ -	of_node_put(domain->of_node); -	kfree(domain); -} - -static void irq_domain_add(struct irq_domain *domain) -{  	mutex_lock(&irq_domain_mutex);  	list_add(&domain->link, &irq_domain_list);  	mutex_unlock(&irq_domain_mutex); -	pr_debug("Allocated domain of type %d @0x%p\n", -		 domain->revmap_type, domain); + +	pr_debug("Added domain %s\n", domain->name); +	return domain;  } +EXPORT_SYMBOL_GPL(__irq_domain_add);  /**   * irq_domain_remove() - Remove an irq domain. @@ -81,29 +77,12 @@ void irq_domain_remove(struct irq_domain *domain)  {  	mutex_lock(&irq_domain_mutex); -	switch (domain->revmap_type) { -	case IRQ_DOMAIN_MAP_LEGACY: -		/* -		 * Legacy domains don't manage their own irq_desc -		 * allocations, we expect the caller to handle irq_desc -		 * freeing on their own. -		 */ -		break; -	case IRQ_DOMAIN_MAP_TREE: -		/* -		 * radix_tree_delete() takes care of destroying the root -		 * node when all entries are removed. Shout if there are -		 * any mappings left. -		 */ -		WARN_ON(domain->revmap_data.tree.height); -		break; -	case IRQ_DOMAIN_MAP_LINEAR: -		kfree(domain->revmap_data.linear.revmap); -		domain->revmap_data.linear.size = 0; -		break; -	case IRQ_DOMAIN_MAP_NOMAP: -		break; -	} +	/* +	 * radix_tree_delete() takes care of destroying the root +	 * node when all entries are removed. Shout if there are +	 * any mappings left. +	 */ +	WARN_ON(domain->revmap_tree.height);  	list_del(&domain->link); @@ -115,44 +94,30 @@ void irq_domain_remove(struct irq_domain *domain)  	mutex_unlock(&irq_domain_mutex); -	pr_debug("Removed domain of type %d @0x%p\n", -		 domain->revmap_type, domain); +	pr_debug("Removed domain %s\n", domain->name); -	irq_domain_free(domain); +	of_node_put(domain->of_node); +	kfree(domain);  }  EXPORT_SYMBOL_GPL(irq_domain_remove); -static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, -					     irq_hw_number_t hwirq) -{ -	irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; -	int size = domain->revmap_data.legacy.size; - -	if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) -		return 0; -	return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; -} -  /** - * irq_domain_add_simple() - Allocate and register a simple irq_domain. 
+ * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs   * @of_node: pointer to interrupt controller's device tree node.   * @size: total number of irqs in mapping   * @first_irq: first number of irq block assigned to the domain, - *	pass zero to assign irqs on-the-fly. This will result in a - *	linear IRQ domain so it is important to use irq_create_mapping() - *	for each used IRQ, especially when SPARSE_IRQ is enabled. + *	pass zero to assign irqs on-the-fly. If first_irq is non-zero, then + *	pre-map all of the irqs in the domain to virqs starting at first_irq.   * @ops: map/unmap domain callbacks   * @host_data: Controller private data pointer   * - * Allocates a legacy irq_domain if irq_base is positive or a linear - * domain otherwise. For the legacy domain, IRQ descriptors will also - * be allocated. + * Allocates an irq_domain, and optionally if first_irq is positive then also + * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq.   *   * This is intended to implement the expected behaviour for most - * interrupt controllers which is that a linear mapping should - * normally be used unless the system requires a legacy mapping in - * order to support supplying interrupt numbers during non-DT - * registration of devices. + * interrupt controllers. If device tree is used, then first_irq will be 0 and + * irqs get mapped dynamically on the fly. However, if the controller requires + * static virq assignments (non-DT boot) then it will set that up correctly.   */  struct irq_domain *irq_domain_add_simple(struct device_node *of_node,  					 unsigned int size, @@ -160,33 +125,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,  					 const struct irq_domain_ops *ops,  					 void *host_data)  { -	if (first_irq > 0) { -		int irq_base; +	struct irq_domain *domain; + +	domain = __irq_domain_add(of_node, size, size, 0, ops, host_data); +	if (!domain) +		return NULL; +	if (first_irq > 0) {  		if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { -			/* -			 * Set the descriptor allocator to search for a -			 * 1-to-1 mapping, such as irq_alloc_desc_at(). -			 * Use of_node_to_nid() which is defined to -			 * numa_node_id() on platforms that have no custom -			 * implementation. 
-			 */ -			irq_base = irq_alloc_descs(first_irq, first_irq, size, -						   of_node_to_nid(of_node)); -			if (irq_base < 0) { +			/* attempt to allocated irq_descs */ +			int rc = irq_alloc_descs(first_irq, first_irq, size, +						 of_node_to_nid(of_node)); +			if (rc < 0)  				pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",  					first_irq); -				irq_base = first_irq; -			} -		} else -			irq_base = first_irq; - -		return irq_domain_add_legacy(of_node, size, irq_base, 0, -					     ops, host_data); +		} +		irq_domain_associate_many(domain, first_irq, 0, size);  	} -	/* A linear domain is the default */ -	return irq_domain_add_linear(of_node, size, ops, host_data); +	return domain;  }  EXPORT_SYMBOL_GPL(irq_domain_add_simple); @@ -213,131 +170,19 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,  					 void *host_data)  {  	struct irq_domain *domain; -	unsigned int i; -	domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); +	domain = __irq_domain_add(of_node, first_hwirq + size, +				  first_hwirq + size, 0, ops, host_data);  	if (!domain)  		return NULL; -	domain->revmap_data.legacy.first_irq = first_irq; -	domain->revmap_data.legacy.first_hwirq = first_hwirq; -	domain->revmap_data.legacy.size = size; - -	mutex_lock(&irq_domain_mutex); -	/* Verify that all the irqs are available */ -	for (i = 0; i < size; i++) { -		int irq = first_irq + i; -		struct irq_data *irq_data = irq_get_irq_data(irq); - -		if (WARN_ON(!irq_data || irq_data->domain)) { -			mutex_unlock(&irq_domain_mutex); -			irq_domain_free(domain); -			return NULL; -		} -	} - -	/* Claim all of the irqs before registering a legacy domain */ -	for (i = 0; i < size; i++) { -		struct irq_data *irq_data = irq_get_irq_data(first_irq + i); -		irq_data->hwirq = first_hwirq + i; -		irq_data->domain = domain; -	} -	mutex_unlock(&irq_domain_mutex); - -	for (i = 0; i < size; i++) { -		int irq = first_irq + i; -		int hwirq = first_hwirq + i; - -		/* IRQ0 gets ignored */ -		if (!irq) -			continue; - -		/* Legacy flags are left to default at this point, -		 * one can then use irq_create_mapping() to -		 * explicitly change them -		 */ -		if (ops->map) -			ops->map(domain, irq, hwirq); - -		/* Clear norequest flags */ -		irq_clear_status_flags(irq, IRQ_NOREQUEST); -	} +	irq_domain_associate_many(domain, first_irq, first_hwirq, size); -	irq_domain_add(domain);  	return domain;  }  EXPORT_SYMBOL_GPL(irq_domain_add_legacy);  /** - * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. - * @of_node: pointer to interrupt controller's device tree node. - * @size: Number of interrupts in the domain. 
- * @ops: map/unmap domain callbacks - * @host_data: Controller private data pointer - */ -struct irq_domain *irq_domain_add_linear(struct device_node *of_node, -					 unsigned int size, -					 const struct irq_domain_ops *ops, -					 void *host_data) -{ -	struct irq_domain *domain; -	unsigned int *revmap; - -	revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL, -			      of_node_to_nid(of_node)); -	if (WARN_ON(!revmap)) -		return NULL; - -	domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); -	if (!domain) { -		kfree(revmap); -		return NULL; -	} -	domain->revmap_data.linear.size = size; -	domain->revmap_data.linear.revmap = revmap; -	irq_domain_add(domain); -	return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_linear); - -struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, -					 unsigned int max_irq, -					 const struct irq_domain_ops *ops, -					 void *host_data) -{ -	struct irq_domain *domain = irq_domain_alloc(of_node, -					IRQ_DOMAIN_MAP_NOMAP, ops, host_data); -	if (domain) { -		domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0; -		irq_domain_add(domain); -	} -	return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_nomap); - -/** - * irq_domain_add_tree() - * @of_node: pointer to interrupt controller's device tree node. - * @ops: map/unmap domain callbacks - * - * Note: The radix tree will be allocated later during boot automatically - * (the reverse mapping will use the slow path until that happens). - */ -struct irq_domain *irq_domain_add_tree(struct device_node *of_node, -					 const struct irq_domain_ops *ops, -					 void *host_data) -{ -	struct irq_domain *domain = irq_domain_alloc(of_node, -					IRQ_DOMAIN_MAP_TREE, ops, host_data); -	if (domain) { -		INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); -		irq_domain_add(domain); -	} -	return domain; -} -EXPORT_SYMBOL_GPL(irq_domain_add_tree); - -/**   * irq_find_host() - Locates a domain for a given device node   * @node: device-tree node of the interrupt controller   */ @@ -385,125 +230,108 @@ void irq_set_default_host(struct irq_domain *domain)  }  EXPORT_SYMBOL_GPL(irq_set_default_host); -static void irq_domain_disassociate_many(struct irq_domain *domain, -					 unsigned int irq_base, int count) +static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)  { -	/* -	 * disassociate in reverse order; -	 * not strictly necessary, but nice for unwinding -	 */ -	while (count--) { -		int irq = irq_base + count; -		struct irq_data *irq_data = irq_get_irq_data(irq); -		irq_hw_number_t hwirq; +	struct irq_data *irq_data = irq_get_irq_data(irq); +	irq_hw_number_t hwirq; -		if (WARN_ON(!irq_data || irq_data->domain != domain)) -			continue; +	if (WARN(!irq_data || irq_data->domain != domain, +		 "virq%i doesn't exist; cannot disassociate\n", irq)) +		return; -		hwirq = irq_data->hwirq; -		irq_set_status_flags(irq, IRQ_NOREQUEST); +	hwirq = irq_data->hwirq; +	irq_set_status_flags(irq, IRQ_NOREQUEST); -		/* remove chip and handler */ -		irq_set_chip_and_handler(irq, NULL, NULL); +	/* remove chip and handler */ +	irq_set_chip_and_handler(irq, NULL, NULL); -		/* Make sure it's completed */ -		synchronize_irq(irq); +	/* Make sure it's completed */ +	synchronize_irq(irq); -		/* Tell the PIC about it */ -		if (domain->ops->unmap) -			domain->ops->unmap(domain, irq); -		smp_mb(); +	/* Tell the PIC about it */ +	if (domain->ops->unmap) +		domain->ops->unmap(domain, irq); +	smp_mb(); -		irq_data->domain = NULL; -		irq_data->hwirq = 0; +	irq_data->domain = NULL; +	
irq_data->hwirq = 0; -		/* Clear reverse map */ -		switch(domain->revmap_type) { -		case IRQ_DOMAIN_MAP_LINEAR: -			if (hwirq < domain->revmap_data.linear.size) -				domain->revmap_data.linear.revmap[hwirq] = 0; -			break; -		case IRQ_DOMAIN_MAP_TREE: -			mutex_lock(&revmap_trees_mutex); -			radix_tree_delete(&domain->revmap_data.tree, hwirq); -			mutex_unlock(&revmap_trees_mutex); -			break; -		} +	/* Clear reverse map for this hwirq */ +	if (hwirq < domain->revmap_size) { +		domain->linear_revmap[hwirq] = 0; +	} else { +		mutex_lock(&revmap_trees_mutex); +		radix_tree_delete(&domain->revmap_tree, hwirq); +		mutex_unlock(&revmap_trees_mutex);  	}  } -int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, -			      irq_hw_number_t hwirq_base, int count) +int irq_domain_associate(struct irq_domain *domain, unsigned int virq, +			 irq_hw_number_t hwirq)  { -	unsigned int virq = irq_base; -	irq_hw_number_t hwirq = hwirq_base; -	int i, ret; +	struct irq_data *irq_data = irq_get_irq_data(virq); +	int ret; -	pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, -		of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); +	if (WARN(hwirq >= domain->hwirq_max, +		 "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name)) +		return -EINVAL; +	if (WARN(!irq_data, "error: virq%i is not allocated", virq)) +		return -EINVAL; +	if (WARN(irq_data->domain, "error: virq%i is already associated", virq)) +		return -EINVAL; -	for (i = 0; i < count; i++) { -		struct irq_data *irq_data = irq_get_irq_data(virq + i); - -		if (WARN(!irq_data, "error: irq_desc not allocated; " -			 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) -			return -EINVAL; -		if (WARN(irq_data->domain, "error: irq_desc already associated; " -			 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) -			return -EINVAL; -	}; - -	for (i = 0; i < count; i++, virq++, hwirq++) { -		struct irq_data *irq_data = irq_get_irq_data(virq); - -		irq_data->hwirq = hwirq; -		irq_data->domain = domain; -		if (domain->ops->map) { -			ret = domain->ops->map(domain, virq, hwirq); -			if (ret != 0) { -				/* -				 * If map() returns -EPERM, this interrupt is protected -				 * by the firmware or some other service and shall not -				 * be mapped. -				 * -				 * Since on some platforms we blindly try to map everything -				 * we end up with a log full of backtraces. -				 * -				 * So instead, we silently fail on -EPERM, it is the -				 * responsibility of the PIC driver to display a relevant -				 * message if needed. -				 */ -				if (ret != -EPERM) { -					pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", -					       virq, hwirq, ret); -					WARN_ON(1); -				} -				irq_data->domain = NULL; -				irq_data->hwirq = 0; -				goto err_unmap; +	mutex_lock(&irq_domain_mutex); +	irq_data->hwirq = hwirq; +	irq_data->domain = domain; +	if (domain->ops->map) { +		ret = domain->ops->map(domain, virq, hwirq); +		if (ret != 0) { +			/* +			 * If map() returns -EPERM, this interrupt is protected +			 * by the firmware or some other service and shall not +			 * be mapped. Don't bother telling the user about it. 
+			 */ +			if (ret != -EPERM) { +				pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n", +				       domain->name, hwirq, virq, ret);  			} +			irq_data->domain = NULL; +			irq_data->hwirq = 0; +			mutex_unlock(&irq_domain_mutex); +			return ret;  		} -		switch (domain->revmap_type) { -		case IRQ_DOMAIN_MAP_LINEAR: -			if (hwirq < domain->revmap_data.linear.size) -				domain->revmap_data.linear.revmap[hwirq] = virq; -			break; -		case IRQ_DOMAIN_MAP_TREE: -			mutex_lock(&revmap_trees_mutex); -			radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); -			mutex_unlock(&revmap_trees_mutex); -			break; -		} +		/* If not already assigned, give the domain the chip's name */ +		if (!domain->name && irq_data->chip) +			domain->name = irq_data->chip->name; +	} -		irq_clear_status_flags(virq, IRQ_NOREQUEST); +	if (hwirq < domain->revmap_size) { +		domain->linear_revmap[hwirq] = virq; +	} else { +		mutex_lock(&revmap_trees_mutex); +		radix_tree_insert(&domain->revmap_tree, hwirq, irq_data); +		mutex_unlock(&revmap_trees_mutex);  	} +	mutex_unlock(&irq_domain_mutex); + +	irq_clear_status_flags(virq, IRQ_NOREQUEST);  	return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_associate); - err_unmap: -	irq_domain_disassociate_many(domain, irq_base, i); -	return -EINVAL; +void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, +			       irq_hw_number_t hwirq_base, int count) +{ +	int i; + +	pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, +		of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); + +	for (i = 0; i < count; i++) { +		irq_domain_associate(domain, irq_base + i, hwirq_base + i); +	}  }  EXPORT_SYMBOL_GPL(irq_domain_associate_many); @@ -513,7 +341,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many);   *   * This routine is used for irq controllers which can choose the hardware   * interrupt numbers they generate. In such a case it's simplest to use - * the linux irq as the hardware interrupt number. + * the linux irq as the hardware interrupt number. It still uses the linear + * or radix tree to store the mapping, but the irq controller can optimize + * the revmap path by using the hwirq directly.   
*/  unsigned int irq_create_direct_mapping(struct irq_domain *domain)  { @@ -522,17 +352,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)  	if (domain == NULL)  		domain = irq_default_domain; -	if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP)) -		return 0; -  	virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));  	if (!virq) {  		pr_debug("create_direct virq allocation failed\n");  		return 0;  	} -	if (virq >= domain->revmap_data.nomap.max_irq) { +	if (virq >= domain->revmap_direct_max_irq) {  		pr_err("ERROR: no free irqs available below %i maximum\n", -			domain->revmap_data.nomap.max_irq); +			domain->revmap_direct_max_irq);  		irq_free_desc(virq);  		return 0;  	} @@ -569,9 +396,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,  	if (domain == NULL)  		domain = irq_default_domain;  	if (domain == NULL) { -		pr_warning("irq_create_mapping called for" -			   " NULL domain, hwirq=%lx\n", hwirq); -		WARN_ON(1); +		WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);  		return 0;  	}  	pr_debug("-> using domain @%p\n", domain); @@ -583,10 +408,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain,  		return virq;  	} -	/* Get a virtual interrupt number */ -	if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) -		return irq_domain_legacy_revmap(domain, hwirq); -  	/* Allocate a virtual interrupt number */  	hint = hwirq % nr_irqs;  	if (hint == 0) @@ -639,12 +460,7 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,  	if (unlikely(ret < 0))  		return ret; -	ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); -	if (unlikely(ret < 0)) { -		irq_free_descs(irq_base, count); -		return ret; -	} - +	irq_domain_associate_many(domain, irq_base, hwirq_base, count);  	return 0;  }  EXPORT_SYMBOL_GPL(irq_create_strict_mappings); @@ -659,20 +475,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller,  	domain = controller ? irq_find_host(controller) : irq_default_domain;  	if (!domain) { -#ifdef CONFIG_MIPS -		/* -		 * Workaround to avoid breaking interrupt controller drivers -		 * that don't yet register an irq_domain.  This is temporary -		 * code. ~~~gcl, Feb 24, 2012 -		 * -		 * Scheduled for removal in Linux v3.6.  That should be enough -		 * time. 
-		 */ -		if (intsize > 0) -			return intspec[0]; -#endif -		pr_warning("no irq domain found for %s !\n", -			   of_node_full_name(controller)); +		pr_warn("no irq domain found for %s !\n", +			of_node_full_name(controller));  		return 0;  	} @@ -714,11 +518,7 @@ void irq_dispose_mapping(unsigned int virq)  	if (WARN_ON(domain == NULL))  		return; -	/* Never unmap legacy interrupts */ -	if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) -		return; - -	irq_domain_disassociate_many(domain, virq, 1); +	irq_domain_disassociate(domain, virq);  	irq_free_desc(virq);  }  EXPORT_SYMBOL_GPL(irq_dispose_mapping); @@ -739,63 +539,51 @@ unsigned int irq_find_mapping(struct irq_domain *domain,  	if (domain == NULL)  		return 0; -	switch (domain->revmap_type) { -	case IRQ_DOMAIN_MAP_LEGACY: -		return irq_domain_legacy_revmap(domain, hwirq); -	case IRQ_DOMAIN_MAP_LINEAR: -		return irq_linear_revmap(domain, hwirq); -	case IRQ_DOMAIN_MAP_TREE: -		rcu_read_lock(); -		data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); -		rcu_read_unlock(); -		if (data) -			return data->irq; -		break; -	case IRQ_DOMAIN_MAP_NOMAP: +	if (hwirq < domain->revmap_direct_max_irq) {  		data = irq_get_irq_data(hwirq);  		if (data && (data->domain == domain) && (data->hwirq == hwirq))  			return hwirq; -		break;  	} -	return 0; -} -EXPORT_SYMBOL_GPL(irq_find_mapping); +	/* Check if the hwirq is in the linear revmap. */ +	if (hwirq < domain->revmap_size) +		return domain->linear_revmap[hwirq]; -/** - * irq_linear_revmap() - Find a linux irq from a hw irq number. - * @domain: domain owning this hardware interrupt - * @hwirq: hardware irq number in that domain space - * - * This is a fast path that can be called directly by irq controller code to - * save a handful of instructions. - */ -unsigned int irq_linear_revmap(struct irq_domain *domain, -			       irq_hw_number_t hwirq) -{ -	BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR); - -	/* Check revmap bounds; complain if exceeded */ -	if (WARN_ON(hwirq >= domain->revmap_data.linear.size)) -		return 0; - -	return domain->revmap_data.linear.revmap[hwirq]; +	rcu_read_lock(); +	data = radix_tree_lookup(&domain->revmap_tree, hwirq); +	rcu_read_unlock(); +	return data ? data->irq : 0;  } -EXPORT_SYMBOL_GPL(irq_linear_revmap); +EXPORT_SYMBOL_GPL(irq_find_mapping);  #ifdef CONFIG_IRQ_DOMAIN_DEBUG  static int virq_debug_show(struct seq_file *m, void *private)  {  	unsigned long flags;  	struct irq_desc *desc; -	const char *p; -	static const char none[] = "none"; -	void *data; +	struct irq_domain *domain; +	struct radix_tree_iter iter; +	void *data, **slot;  	int i; -	seq_printf(m, "%-5s  %-7s  %-15s  %-*s  %s\n", "irq", "hwirq", +	seq_printf(m, " %-16s  %-6s  %-10s  %-10s  %s\n", +		   "name", "mapped", "linear-max", "direct-max", "devtree-node"); +	mutex_lock(&irq_domain_mutex); +	list_for_each_entry(domain, &irq_domain_list, link) { +		int count = 0; +		radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) +			count++; +		seq_printf(m, "%c%-16s  %6u  %10u  %10u  %s\n", +			   domain == irq_default_domain ? '*' : ' ', domain->name, +			   domain->revmap_size + count, domain->revmap_size, +			   domain->revmap_direct_max_irq, +			   domain->of_node ? 
of_node_full_name(domain->of_node) : ""); +	} +	mutex_unlock(&irq_domain_mutex); + +	seq_printf(m, "%-5s  %-7s  %-15s  %-*s  %6s  %-14s  %s\n", "irq", "hwirq",  		      "chip name", (int)(2 * sizeof(void *) + 2), "chip data", -		      "domain name"); +		      "active", "type", "domain");  	for (i = 1; i < nr_irqs; i++) {  		desc = irq_to_desc(i); @@ -803,28 +591,28 @@ static int virq_debug_show(struct seq_file *m, void *private)  			continue;  		raw_spin_lock_irqsave(&desc->lock, flags); +		domain = desc->irq_data.domain; -		if (desc->action && desc->action->handler) { +		if (domain) {  			struct irq_chip *chip; +			int hwirq = desc->irq_data.hwirq; +			bool direct;  			seq_printf(m, "%5d  ", i); -			seq_printf(m, "0x%05lx  ", desc->irq_data.hwirq); +			seq_printf(m, "0x%05x  ", hwirq);  			chip = irq_desc_get_chip(desc); -			if (chip && chip->name) -				p = chip->name; -			else -				p = none; -			seq_printf(m, "%-15s  ", p); +			seq_printf(m, "%-15s  ", (chip && chip->name) ? chip->name : "none");  			data = irq_desc_get_chip_data(desc);  			seq_printf(m, data ? "0x%p  " : "  %p  ", data); -			if (desc->irq_data.domain) -				p = of_node_full_name(desc->irq_data.domain->of_node); -			else -				p = none; -			seq_printf(m, "%s\n", p); +			seq_printf(m, "   %c    ", (desc->action && desc->action->handler) ? '*' : ' '); +			direct = (i == hwirq) && (i < domain->revmap_direct_max_irq); +			seq_printf(m, "%6s%-8s  ", +				   (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", +				   direct ? "(DIRECT)" : ""); +			seq_printf(m, "%s\n", desc->irq_data.domain->name);  		}  		raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -921,18 +709,3 @@ const struct irq_domain_ops irq_domain_simple_ops = {  	.xlate = irq_domain_xlate_onetwocell,  };  EXPORT_SYMBOL_GPL(irq_domain_simple_ops); - -#ifdef CONFIG_OF_IRQ -void irq_domain_generate_simple(const struct of_device_id *match, -				u64 phys_base, unsigned int irq_start) -{ -	struct device_node *node; -	pr_debug("looking for phys_base=%llx, irq_start=%i\n", -		(unsigned long long) phys_base, (int) irq_start); -	node = of_find_matching_node_by_address(NULL, match, phys_base); -	if (node) -		irq_domain_add_legacy(node, 32, irq_start, 0, -				      &irq_domain_simple_ops, NULL); -} -EXPORT_SYMBOL_GPL(irq_domain_generate_simple); -#endif diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 19ed5c425c3b..36f6ee181b0c 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -462,6 +462,8 @@ int show_interrupts(struct seq_file *p, void *v)  	} else {  		seq_printf(p, " %8s", "None");  	} +	if (desc->irq_data.domain) +		seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);  #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL  	seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");  #endif diff --git a/kernel/module.c b/kernel/module.c index cab4bce49c23..206915830d29 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -455,7 +455,7 @@ const struct kernel_symbol *find_symbol(const char *name,  EXPORT_SYMBOL_GPL(find_symbol);  /* Search for module by name: must hold module_mutex. 
*/ -static struct module *find_module_all(const char *name, +static struct module *find_module_all(const char *name, size_t len,  				      bool even_unformed)  {  	struct module *mod; @@ -463,7 +463,7 @@ static struct module *find_module_all(const char *name,  	list_for_each_entry(mod, &modules, list) {  		if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)  			continue; -		if (strcmp(mod->name, name) == 0) +		if (strlen(mod->name) == len && !memcmp(mod->name, name, len))  			return mod;  	}  	return NULL; @@ -471,7 +471,7 @@ static struct module *find_module_all(const char *name,  struct module *find_module(const char *name)  { -	return find_module_all(name, false); +	return find_module_all(name, strlen(name), false);  }  EXPORT_SYMBOL_GPL(find_module); @@ -482,23 +482,28 @@ static inline void __percpu *mod_percpu(struct module *mod)  	return mod->percpu;  } -static int percpu_modalloc(struct module *mod, -			   unsigned long size, unsigned long align) +static int percpu_modalloc(struct module *mod, struct load_info *info)  { +	Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu]; +	unsigned long align = pcpusec->sh_addralign; + +	if (!pcpusec->sh_size) +		return 0; +  	if (align > PAGE_SIZE) {  		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",  		       mod->name, align, PAGE_SIZE);  		align = PAGE_SIZE;  	} -	mod->percpu = __alloc_reserved_percpu(size, align); +	mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);  	if (!mod->percpu) {  		printk(KERN_WARNING  		       "%s: Could not allocate %lu bytes percpu data\n", -		       mod->name, size); +		       mod->name, (unsigned long)pcpusec->sh_size);  		return -ENOMEM;  	} -	mod->percpu_size = size; +	mod->percpu_size = pcpusec->sh_size;  	return 0;  } @@ -563,10 +568,12 @@ static inline void __percpu *mod_percpu(struct module *mod)  {  	return NULL;  } -static inline int percpu_modalloc(struct module *mod, -				  unsigned long size, unsigned long align) +static int percpu_modalloc(struct module *mod, struct load_info *info)  { -	return -ENOMEM; +	/* UP modules shouldn't have this section: ENOMEM isn't quite right */ +	if (info->sechdrs[info->index.pcpu].sh_size != 0) +		return -ENOMEM; +	return 0;  }  static inline void percpu_modfree(struct module *mod)  { @@ -2927,7 +2934,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)  {  	/* Module within temporary copy. */  	struct module *mod; -	Elf_Shdr *pcpusec;  	int err;  	mod = setup_load_info(info, flags); @@ -2942,17 +2948,10 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)  	err = module_frob_arch_sections(info->hdr, info->sechdrs,  					info->secstrings, mod);  	if (err < 0) -		goto out; +		return ERR_PTR(err); -	pcpusec = &info->sechdrs[info->index.pcpu]; -	if (pcpusec->sh_size) { -		/* We have a special allocation for this section. */ -		err = percpu_modalloc(mod, -				      pcpusec->sh_size, pcpusec->sh_addralign); -		if (err) -			goto out; -		pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; -	} +	/* We will do a special allocation for per-cpu sections later. */ +	info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;  	/* Determine total sizes, and put offsets in sh_entsize.  
For now  	   this is done generically; there doesn't appear to be any @@ -2963,17 +2962,12 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)  	/* Allocate and move to the final place */  	err = move_module(mod, info);  	if (err) -		goto free_percpu; +		return ERR_PTR(err);  	/* Module has been copied to its final place now: return it. */  	mod = (void *)info->sechdrs[info->index.mod].sh_addr;  	kmemleak_load_module(mod, info);  	return mod; - -free_percpu: -	percpu_modfree(mod); -out: -	return ERR_PTR(err);  }  /* mod is no longer valid after this! */ @@ -3014,7 +3008,7 @@ static bool finished_loading(const char *name)  	bool ret;  	mutex_lock(&module_mutex); -	mod = find_module_all(name, true); +	mod = find_module_all(name, strlen(name), true);  	ret = !mod || mod->state == MODULE_STATE_LIVE  		|| mod->state == MODULE_STATE_GOING;  	mutex_unlock(&module_mutex); @@ -3152,7 +3146,8 @@ static int add_unformed_module(struct module *mod)  again:  	mutex_lock(&module_mutex); -	if ((old = find_module_all(mod->name, true)) != NULL) { +	old = find_module_all(mod->name, strlen(mod->name), true); +	if (old != NULL) {  		if (old->state == MODULE_STATE_COMING  		    || old->state == MODULE_STATE_UNFORMED) {  			/* Wait in case it fails to load. */ @@ -3198,6 +3193,17 @@ out:  	return err;  } +static int unknown_module_param_cb(char *param, char *val, const char *modname) +{ +	/* Check for magic 'dyndbg' arg */  +	int ret = ddebug_dyndbg_module_param_cb(param, val, modname); +	if (ret != 0) { +		printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n", +		       modname, param); +	} +	return 0; +} +  /* Allocate and load the module: note that size of section 0 is always     zero, and we rely on this for optional sections. */  static int load_module(struct load_info *info, const char __user *uargs, @@ -3237,6 +3243,11 @@ static int load_module(struct load_info *info, const char __user *uargs,  	}  #endif +	/* To avoid stressing percpu allocator, do this once we're unique. */ +	err = percpu_modalloc(mod, info); +	if (err) +		goto unlink_mod; +  	/* Now module is in final location, initialize linked lists, etc. */  	err = module_unload_init(mod);  	if (err) @@ -3284,7 +3295,7 @@ static int load_module(struct load_info *info, const char __user *uargs,  	/* Module is ready to execute: parsing args may do that. */  	err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -			 -32768, 32767, &ddebug_dyndbg_module_param_cb); +			 -32768, 32767, unknown_module_param_cb);  	if (err < 0)  		goto bug_cleanup; @@ -3563,10 +3574,8 @@ unsigned long module_kallsyms_lookup_name(const char *name)  	/* Don't lock: we're in enough trouble already. */  	preempt_disable();  	if ((colon = strchr(name, ':')) != NULL) { -		*colon = '\0'; -		if ((mod = find_module(name)) != NULL) +		if ((mod = find_module_all(name, colon - name, false)) != NULL)  			ret = mod_find_symname(mod, colon+1); -		*colon = ':';  	} else {  		list_for_each_entry_rcu(mod, &modules, list) {  			if (mod->state == MODULE_STATE_UNFORMED) diff --git a/kernel/mutex.c b/kernel/mutex.c index e581ada5faf4..ff05f4bd86eb 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -18,6 +18,7 @@   * Also see Documentation/mutex-design.txt.   
*/  #include <linux/mutex.h> +#include <linux/ww_mutex.h>  #include <linux/sched.h>  #include <linux/sched/rt.h>  #include <linux/export.h> diff --git a/kernel/panic.c b/kernel/panic.c index 167ec097ce8b..801864600514 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -15,6 +15,7 @@  #include <linux/notifier.h>  #include <linux/module.h>  #include <linux/random.h> +#include <linux/ftrace.h>  #include <linux/reboot.h>  #include <linux/delay.h>  #include <linux/kexec.h> @@ -399,8 +400,11 @@ struct slowpath_args {  static void warn_slowpath_common(const char *file, int line, void *caller,  				 unsigned taint, struct slowpath_args *args)  { -	printk(KERN_WARNING "------------[ cut here ]------------\n"); -	printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); +	disable_trace_on_warning(); + +	pr_warn("------------[ cut here ]------------\n"); +	pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", +		raw_smp_processor_id(), current->pid, file, line, caller);  	if (args)  		vprintk(args->fmt, args->args); diff --git a/kernel/params.c b/kernel/params.c index 53b958fcd639..440e65d1a544 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -787,7 +787,7 @@ static void __init kernel_add_sysfs_param(const char *name,  }  /* - * param_sysfs_builtin - add contents in /sys/parameters for built-in modules + * param_sysfs_builtin - add sysfs parameters for built-in modules   *   * Add module_parameters to sysfs for "modules" built into the kernel.   * diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 42670e9b44e0..c7f31aa272f7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock)  	return error;  } -static inline union cpu_time_count +static inline unsigned long long  timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)  { -	union cpu_time_count ret; -	ret.sched = 0;		/* high half always zero when .cpu used */ +	unsigned long long ret; + +	ret = 0;		/* high half always zero when .cpu used */  	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { -		ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; +		ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;  	} else { -		ret.cpu = timespec_to_cputime(tp); +		ret = cputime_to_expires(timespec_to_cputime(tp));  	}  	return ret;  }  static void sample_to_timespec(const clockid_t which_clock, -			       union cpu_time_count cpu, +			       unsigned long long expires,  			       struct timespec *tp)  {  	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) -		*tp = ns_to_timespec(cpu.sched); +		*tp = ns_to_timespec(expires);  	else -		cputime_to_timespec(cpu.cpu, tp); -} - -static inline int cpu_time_before(const clockid_t which_clock, -				  union cpu_time_count now, -				  union cpu_time_count then) -{ -	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { -		return now.sched < then.sched; -	}  else { -		return now.cpu < then.cpu; -	} -} -static inline void cpu_time_add(const clockid_t which_clock, -				union cpu_time_count *acc, -			        union cpu_time_count val) -{ -	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { -		acc->sched += val.sched; -	}  else { -		acc->cpu += val.cpu; -	} -} -static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, -						union cpu_time_count a, -						union cpu_time_count b) -{ -	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { -		a.sched -= b.sched; -	}  else { -		a.cpu -= b.cpu; -	} -	return a; +		
cputime_to_timespec((__force cputime_t)expires, tp);  }  /* @@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,   * given the current clock sample.   */  static void bump_cpu_timer(struct k_itimer *timer, -				  union cpu_time_count now) +			   unsigned long long now)  {  	int i; +	unsigned long long delta, incr; -	if (timer->it.cpu.incr.sched == 0) +	if (timer->it.cpu.incr == 0)  		return; -	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { -		unsigned long long delta, incr; +	if (now < timer->it.cpu.expires) +		return; -		if (now.sched < timer->it.cpu.expires.sched) -			return; -		incr = timer->it.cpu.incr.sched; -		delta = now.sched + incr - timer->it.cpu.expires.sched; -		/* Don't use (incr*2 < delta), incr*2 might overflow. */ -		for (i = 0; incr < delta - incr; i++) -			incr = incr << 1; -		for (; i >= 0; incr >>= 1, i--) { -			if (delta < incr) -				continue; -			timer->it.cpu.expires.sched += incr; -			timer->it_overrun += 1 << i; -			delta -= incr; -		} -	} else { -		cputime_t delta, incr; +	incr = timer->it.cpu.incr; +	delta = now + incr - timer->it.cpu.expires; -		if (now.cpu < timer->it.cpu.expires.cpu) -			return; -		incr = timer->it.cpu.incr.cpu; -		delta = now.cpu + incr - timer->it.cpu.expires.cpu; -		/* Don't use (incr*2 < delta), incr*2 might overflow. */ -		for (i = 0; incr < delta - incr; i++) -			     incr += incr; -		for (; i >= 0; incr = incr >> 1, i--) { -			if (delta < incr) -				continue; -			timer->it.cpu.expires.cpu += incr; -			timer->it_overrun += 1 << i; -			delta -= incr; -		} +	/* Don't use (incr*2 < delta), incr*2 might overflow. */ +	for (i = 0; incr < delta - incr; i++) +		incr = incr << 1; + +	for (; i >= 0; incr >>= 1, i--) { +		if (delta < incr) +			continue; + +		timer->it.cpu.expires += incr; +		timer->it_overrun += 1 << i; +		delta -= incr;  	}  } @@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)  	return 0;  } -static inline cputime_t prof_ticks(struct task_struct *p) +static inline unsigned long long prof_ticks(struct task_struct *p)  {  	cputime_t utime, stime;  	task_cputime(p, &utime, &stime); -	return utime + stime; +	return cputime_to_expires(utime + stime);  } -static inline cputime_t virt_ticks(struct task_struct *p) +static inline unsigned long long virt_ticks(struct task_struct *p)  {  	cputime_t utime;  	task_cputime(p, &utime, NULL); -	return utime; +	return cputime_to_expires(utime);  }  static int @@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)   * Sample a per-thread clock for the given task.   
*/  static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, -			    union cpu_time_count *cpu) +			    unsigned long long *sample)  {  	switch (CPUCLOCK_WHICH(which_clock)) {  	default:  		return -EINVAL;  	case CPUCLOCK_PROF: -		cpu->cpu = prof_ticks(p); +		*sample = prof_ticks(p);  		break;  	case CPUCLOCK_VIRT: -		cpu->cpu = virt_ticks(p); +		*sample = virt_ticks(p);  		break;  	case CPUCLOCK_SCHED: -		cpu->sched = task_sched_runtime(p); +		*sample = task_sched_runtime(p);  		break;  	}  	return 0; @@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)   */  static int cpu_clock_sample_group(const clockid_t which_clock,  				  struct task_struct *p, -				  union cpu_time_count *cpu) +				  unsigned long long *sample)  {  	struct task_cputime cputime; @@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock,  		return -EINVAL;  	case CPUCLOCK_PROF:  		thread_group_cputime(p, &cputime); -		cpu->cpu = cputime.utime + cputime.stime; +		*sample = cputime_to_expires(cputime.utime + cputime.stime);  		break;  	case CPUCLOCK_VIRT:  		thread_group_cputime(p, &cputime); -		cpu->cpu = cputime.utime; +		*sample = cputime_to_expires(cputime.utime);  		break;  	case CPUCLOCK_SCHED:  		thread_group_cputime(p, &cputime); -		cpu->sched = cputime.sum_exec_runtime; +		*sample = cputime.sum_exec_runtime;  		break;  	}  	return 0; @@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)  {  	const pid_t pid = CPUCLOCK_PID(which_clock);  	int error = -EINVAL; -	union cpu_time_count rtn; +	unsigned long long rtn;  	if (pid == 0) {  		/* @@ -446,6 +399,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer)  	return ret;  } +static void cleanup_timers_list(struct list_head *head, +				unsigned long long curr) +{ +	struct cpu_timer_list *timer, *next; + +	list_for_each_entry_safe(timer, next, head, entry) +		list_del_init(&timer->entry); +} +  /*   * Clean out CPU timers still ticking when a thread exited.  
The task   * pointer is cleared, and the expiry time is replaced with the residual @@ -456,37 +418,12 @@ static void cleanup_timers(struct list_head *head,  			   cputime_t utime, cputime_t stime,  			   unsigned long long sum_exec_runtime)  { -	struct cpu_timer_list *timer, *next; -	cputime_t ptime = utime + stime; - -	list_for_each_entry_safe(timer, next, head, entry) { -		list_del_init(&timer->entry); -		if (timer->expires.cpu < ptime) { -			timer->expires.cpu = 0; -		} else { -			timer->expires.cpu -= ptime; -		} -	} -	++head; -	list_for_each_entry_safe(timer, next, head, entry) { -		list_del_init(&timer->entry); -		if (timer->expires.cpu < utime) { -			timer->expires.cpu = 0; -		} else { -			timer->expires.cpu -= utime; -		} -	} +	cputime_t ptime = utime + stime; -	++head; -	list_for_each_entry_safe(timer, next, head, entry) { -		list_del_init(&timer->entry); -		if (timer->expires.sched < sum_exec_runtime) { -			timer->expires.sched = 0; -		} else { -			timer->expires.sched -= sum_exec_runtime; -		} -	} +	cleanup_timers_list(head, cputime_to_expires(ptime)); +	cleanup_timers_list(++head, cputime_to_expires(utime)); +	cleanup_timers_list(++head, sum_exec_runtime);  }  /* @@ -516,17 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)  		       tsk->se.sum_exec_runtime + sig->sum_sched_runtime);  } -static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +static void clear_dead_task(struct k_itimer *itimer, unsigned long long now)  { +	struct cpu_timer_list *timer = &itimer->it.cpu; +  	/*  	 * That's all for this thread or process.  	 * We leave our residual in expires to be reported.  	 */ -	put_task_struct(timer->it.cpu.task); -	timer->it.cpu.task = NULL; -	timer->it.cpu.expires = cpu_time_sub(timer->it_clock, -					     timer->it.cpu.expires, -					     now); +	put_task_struct(timer->task); +	timer->task = NULL; +	if (timer->expires < now) { +		timer->expires = 0; +	} else { +		timer->expires -= now; +	}  }  static inline int expires_gt(cputime_t expires, cputime_t new_exp) @@ -558,14 +499,14 @@ static void arm_timer(struct k_itimer *timer)  	listpos = head;  	list_for_each_entry(next, head, entry) { -		if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) +		if (nt->expires < next->expires)  			break;  		listpos = &next->entry;  	}  	list_add(&nt->entry, listpos);  	if (listpos == head) { -		union cpu_time_count *exp = &nt->expires; +		unsigned long long exp = nt->expires;  		/*  		 * We are the new earliest-expiring POSIX 1.b timer, hence @@ -576,17 +517,17 @@ static void arm_timer(struct k_itimer *timer)  		switch (CPUCLOCK_WHICH(timer->it_clock)) {  		case CPUCLOCK_PROF: -			if (expires_gt(cputime_expires->prof_exp, exp->cpu)) -				cputime_expires->prof_exp = exp->cpu; +			if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) +				cputime_expires->prof_exp = expires_to_cputime(exp);  			break;  		case CPUCLOCK_VIRT: -			if (expires_gt(cputime_expires->virt_exp, exp->cpu)) -				cputime_expires->virt_exp = exp->cpu; +			if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) +				cputime_expires->virt_exp = expires_to_cputime(exp);  			break;  		case CPUCLOCK_SCHED:  			if (cputime_expires->sched_exp == 0 || -			    cputime_expires->sched_exp > exp->sched) -				cputime_expires->sched_exp = exp->sched; +			    cputime_expires->sched_exp > exp) +				cputime_expires->sched_exp = exp;  			break;  		}  	} @@ -601,20 +542,20 @@ static void cpu_timer_fire(struct k_itimer *timer)  		/*  		 * User don't want any 
signal.  		 */ -		timer->it.cpu.expires.sched = 0; +		timer->it.cpu.expires = 0;  	} else if (unlikely(timer->sigq == NULL)) {  		/*  		 * This a special case for clock_nanosleep,  		 * not a normal timer from sys_timer_create.  		 */  		wake_up_process(timer->it_process); -		timer->it.cpu.expires.sched = 0; -	} else if (timer->it.cpu.incr.sched == 0) { +		timer->it.cpu.expires = 0; +	} else if (timer->it.cpu.incr == 0) {  		/*  		 * One-shot timer.  Clear it as soon as it's fired.  		 */  		posix_timer_event(timer, 0); -		timer->it.cpu.expires.sched = 0; +		timer->it.cpu.expires = 0;  	} else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {  		/*  		 * The signal did not get queued because the signal @@ -632,7 +573,7 @@ static void cpu_timer_fire(struct k_itimer *timer)   */  static int cpu_timer_sample_group(const clockid_t which_clock,  				  struct task_struct *p, -				  union cpu_time_count *cpu) +				  unsigned long long *sample)  {  	struct task_cputime cputime; @@ -641,13 +582,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock,  	default:  		return -EINVAL;  	case CPUCLOCK_PROF: -		cpu->cpu = cputime.utime + cputime.stime; +		*sample = cputime_to_expires(cputime.utime + cputime.stime);  		break;  	case CPUCLOCK_VIRT: -		cpu->cpu = cputime.utime; +		*sample = cputime_to_expires(cputime.utime);  		break;  	case CPUCLOCK_SCHED: -		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); +		*sample = cputime.sum_exec_runtime + task_delta_exec(p);  		break;  	}  	return 0; @@ -694,7 +635,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,  			       struct itimerspec *new, struct itimerspec *old)  {  	struct task_struct *p = timer->it.cpu.task; -	union cpu_time_count old_expires, new_expires, old_incr, val; +	unsigned long long old_expires, new_expires, old_incr, val;  	int ret;  	if (unlikely(p == NULL)) { @@ -749,7 +690,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,  	}  	if (old) { -		if (old_expires.sched == 0) { +		if (old_expires == 0) {  			old->it_value.tv_sec = 0;  			old->it_value.tv_nsec = 0;  		} else { @@ -764,11 +705,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,  			 * new setting.  			 */  			bump_cpu_timer(timer, val); -			if (cpu_time_before(timer->it_clock, val, -					    timer->it.cpu.expires)) { -				old_expires = cpu_time_sub( -					timer->it_clock, -					timer->it.cpu.expires, val); +			if (val < timer->it.cpu.expires) { +				old_expires = timer->it.cpu.expires - val;  				sample_to_timespec(timer->it_clock,  						   old_expires,  						   &old->it_value); @@ -791,8 +729,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,  		goto out;  	} -	if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { -		cpu_time_add(timer->it_clock, &new_expires, val); +	if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { +		new_expires += val;  	}  	/* @@ -801,8 +739,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,  	 * arm the timer (we'll just fake it for timer_gettime).  	 
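posix_cpu_timer_set() above is the kernel half of timer_settime() on a CPU-time clock: unless TIMER_ABSTIME is passed, the requested value is made absolute by adding the sampled clock ("new_expires += val"). A small userspace counterpart arming a one-shot, relative, process-CPU-time timer; the choice of SIGRTMIN and the one-second value are arbitrary (link with -lrt on older glibc):

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t fired;

static void handler(int sig)
{
        (void)sig;
        fired = 1;
}

int main(void)
{
        struct sigaction sa;
        struct sigevent sev;
        struct itimerspec its;
        timer_t timerid;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = handler;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGRTMIN, &sa, NULL);

        memset(&sev, 0, sizeof(sev));
        sev.sigev_notify = SIGEV_SIGNAL;
        sev.sigev_signo = SIGRTMIN;

        if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &timerid)) {
                perror("timer_create");
                return 1;
        }

        memset(&its, 0, sizeof(its));
        its.it_value.tv_sec = 1;        /* relative: 1s of process CPU time */

        /* flags == 0, i.e. not TIMER_ABSTIME: the kernel adds the current
         * clock sample to make the expiry absolute. */
        if (timer_settime(timerid, 0, &its, NULL)) {
                perror("timer_settime");
                return 1;
        }

        while (!fired)          /* spin, consuming process CPU time */
                ;

        printf("CPU timer fired after ~1s of CPU time\n");
        timer_delete(timerid);
        return 0;
}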
*/  	timer->it.cpu.expires = new_expires; -	if (new_expires.sched != 0 && -	    cpu_time_before(timer->it_clock, val, new_expires)) { +	if (new_expires != 0 && val < new_expires) {  		arm_timer(timer);  	} @@ -826,8 +763,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,  	timer->it_overrun_last = 0;  	timer->it_overrun = -1; -	if (new_expires.sched != 0 && -	    !cpu_time_before(timer->it_clock, val, new_expires)) { +	if (new_expires != 0 && !(val < new_expires)) {  		/*  		 * The designated time already passed, so we notify  		 * immediately, even if the thread never runs to @@ -849,7 +785,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,  static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)  { -	union cpu_time_count now; +	unsigned long long now;  	struct task_struct *p = timer->it.cpu.task;  	int clear_dead; @@ -859,7 +795,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)  	sample_to_timespec(timer->it_clock,  			   timer->it.cpu.incr, &itp->it_interval); -	if (timer->it.cpu.expires.sched == 0) {	/* Timer not armed at all.  */ +	if (timer->it.cpu.expires == 0) {	/* Timer not armed at all.  */  		itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;  		return;  	} @@ -891,7 +827,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)  			 */  			put_task_struct(p);  			timer->it.cpu.task = NULL; -			timer->it.cpu.expires.sched = 0; +			timer->it.cpu.expires = 0;  			read_unlock(&tasklist_lock);  			goto dead;  		} else { @@ -912,10 +848,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)  		goto dead;  	} -	if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { +	if (now < timer->it.cpu.expires) {  		sample_to_timespec(timer->it_clock, -				   cpu_time_sub(timer->it_clock, -						timer->it.cpu.expires, now), +				   timer->it.cpu.expires - now,  				   &itp->it_value);  	} else {  		/* @@ -927,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)  	}  } +static unsigned long long +check_timers_list(struct list_head *timers, +		  struct list_head *firing, +		  unsigned long long curr) +{ +	int maxfire = 20; + +	while (!list_empty(timers)) { +		struct cpu_timer_list *t; + +		t = list_first_entry(timers, struct cpu_timer_list, entry); + +		if (!--maxfire || curr < t->expires) +			return t->expires; + +		t->firing = 1; +		list_move_tail(&t->entry, firing); +	} + +	return 0; +} +  /*   * Check for any per-thread CPU timers that have fired and move them off   * the tsk->cpu_timers[N] list onto the firing list.  
Here we update the @@ -935,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)  static void check_thread_timers(struct task_struct *tsk,  				struct list_head *firing)  { -	int maxfire;  	struct list_head *timers = tsk->cpu_timers;  	struct signal_struct *const sig = tsk->signal; +	struct task_cputime *tsk_expires = &tsk->cputime_expires; +	unsigned long long expires;  	unsigned long soft; -	maxfire = 20; -	tsk->cputime_expires.prof_exp = 0; -	while (!list_empty(timers)) { -		struct cpu_timer_list *t = list_first_entry(timers, -						      struct cpu_timer_list, -						      entry); -		if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { -			tsk->cputime_expires.prof_exp = t->expires.cpu; -			break; -		} -		t->firing = 1; -		list_move_tail(&t->entry, firing); -	} +	expires = check_timers_list(timers, firing, prof_ticks(tsk)); +	tsk_expires->prof_exp = expires_to_cputime(expires); -	++timers; -	maxfire = 20; -	tsk->cputime_expires.virt_exp = 0; -	while (!list_empty(timers)) { -		struct cpu_timer_list *t = list_first_entry(timers, -						      struct cpu_timer_list, -						      entry); -		if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { -			tsk->cputime_expires.virt_exp = t->expires.cpu; -			break; -		} -		t->firing = 1; -		list_move_tail(&t->entry, firing); -	} +	expires = check_timers_list(++timers, firing, virt_ticks(tsk)); +	tsk_expires->virt_exp = expires_to_cputime(expires); -	++timers; -	maxfire = 20; -	tsk->cputime_expires.sched_exp = 0; -	while (!list_empty(timers)) { -		struct cpu_timer_list *t = list_first_entry(timers, -						      struct cpu_timer_list, -						      entry); -		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { -			tsk->cputime_expires.sched_exp = t->expires.sched; -			break; -		} -		t->firing = 1; -		list_move_tail(&t->entry, firing); -	} +	tsk_expires->sched_exp = check_timers_list(++timers, firing, +						   tsk->se.sum_exec_runtime);  	/*  	 * Check for the special case thread timers. @@ -1030,7 +953,8 @@ static void stop_process_timers(struct signal_struct *sig)  static u32 onecputick;  static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, -			     cputime_t *expires, cputime_t cur_time, int signo) +			     unsigned long long *expires, +			     unsigned long long cur_time, int signo)  {  	if (!it->expires)  		return; @@ -1066,9 +990,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,  static void check_process_timers(struct task_struct *tsk,  				 struct list_head *firing)  { -	int maxfire;  	struct signal_struct *const sig = tsk->signal; -	cputime_t utime, ptime, virt_expires, prof_expires; +	unsigned long long utime, ptime, virt_expires, prof_expires;  	unsigned long long sum_sched_runtime, sched_expires;  	struct list_head *timers = sig->cpu_timers;  	struct task_cputime cputime; @@ -1078,52 +1001,13 @@ static void check_process_timers(struct task_struct *tsk,  	 * Collect the current process totals.  	 
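check_timers_list() above folds three nearly identical loops into one helper: walk a list sorted by expiry, move expired entries (bounded by the maxfire cap) onto the firing list, and return the expiry that should stay pending, or 0 once the list drains. A self-contained sketch of the same pattern over a toy singly linked list; struct toy_timer and check_toy_timers() are illustrative only:

#include <stddef.h>
#include <stdio.h>

struct toy_timer {
        unsigned long long expires;
        int firing;
        struct toy_timer *next;
};

/* Pop expired timers (up to a small cap) off a list sorted by expiry,
 * mark them firing, and return the earliest expiry still pending. */
static unsigned long long check_toy_timers(struct toy_timer **head,
                                           unsigned long long curr)
{
        int maxfire = 20;

        while (*head) {
                struct toy_timer *t = *head;

                if (!--maxfire || curr < t->expires)
                        return t->expires;

                t->firing = 1;
                *head = t->next;        /* "move to the firing list" */
        }
        return 0;
}

int main(void)
{
        struct toy_timer c = { 30, 0, NULL };
        struct toy_timer b = { 20, 0, &c };
        struct toy_timer a = { 10, 0, &b };
        struct toy_timer *head = &a;
        unsigned long long next = check_toy_timers(&head, 25);

        printf("next expiry: %llu, fired: a=%d b=%d c=%d\n",
               next, a.firing, b.firing, c.firing);
        return 0;
}

With a current sample of 25 the 10 and 20 entries fire and 30 is reported back, mirroring how the return value is folded into cputime_expires above.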
*/  	thread_group_cputimer(tsk, &cputime); -	utime = cputime.utime; -	ptime = utime + cputime.stime; +	utime = cputime_to_expires(cputime.utime); +	ptime = utime + cputime_to_expires(cputime.stime);  	sum_sched_runtime = cputime.sum_exec_runtime; -	maxfire = 20; -	prof_expires = 0; -	while (!list_empty(timers)) { -		struct cpu_timer_list *tl = list_first_entry(timers, -						      struct cpu_timer_list, -						      entry); -		if (!--maxfire || ptime < tl->expires.cpu) { -			prof_expires = tl->expires.cpu; -			break; -		} -		tl->firing = 1; -		list_move_tail(&tl->entry, firing); -	} -	++timers; -	maxfire = 20; -	virt_expires = 0; -	while (!list_empty(timers)) { -		struct cpu_timer_list *tl = list_first_entry(timers, -						      struct cpu_timer_list, -						      entry); -		if (!--maxfire || utime < tl->expires.cpu) { -			virt_expires = tl->expires.cpu; -			break; -		} -		tl->firing = 1; -		list_move_tail(&tl->entry, firing); -	} - -	++timers; -	maxfire = 20; -	sched_expires = 0; -	while (!list_empty(timers)) { -		struct cpu_timer_list *tl = list_first_entry(timers, -						      struct cpu_timer_list, -						      entry); -		if (!--maxfire || sum_sched_runtime < tl->expires.sched) { -			sched_expires = tl->expires.sched; -			break; -		} -		tl->firing = 1; -		list_move_tail(&tl->entry, firing); -	} +	prof_expires = check_timers_list(timers, firing, ptime); +	virt_expires = check_timers_list(++timers, firing, utime); +	sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);  	/*  	 * Check for the special case process timers. @@ -1162,8 +1046,8 @@ static void check_process_timers(struct task_struct *tsk,  		}  	} -	sig->cputime_expires.prof_exp = prof_expires; -	sig->cputime_expires.virt_exp = virt_expires; +	sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); +	sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);  	sig->cputime_expires.sched_exp = sched_expires;  	if (task_cputime_zero(&sig->cputime_expires))  		stop_process_timers(sig); @@ -1176,7 +1060,7 @@ static void check_process_timers(struct task_struct *tsk,  void posix_cpu_timer_schedule(struct k_itimer *timer)  {  	struct task_struct *p = timer->it.cpu.task; -	union cpu_time_count now; +	unsigned long long now;  	if (unlikely(p == NULL))  		/* @@ -1205,7 +1089,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)  			 */  			put_task_struct(p);  			timer->it.cpu.task = p = NULL; -			timer->it.cpu.expires.sched = 0; +			timer->it.cpu.expires = 0;  			goto out_unlock;  		} else if (unlikely(p->exit_state) && thread_group_empty(p)) {  			/* @@ -1213,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)  			 * not yet reaped.  Take this opportunity to  			 * drop our task ref.  			 */ +			cpu_timer_sample_group(timer->it_clock, p, &now);  			clear_dead_task(timer, now);  			goto out_unlock;  		} @@ -1387,7 +1272,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)  void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,  			   cputime_t *newval, cputime_t *oldval)  { -	union cpu_time_count now; +	unsigned long long now;  	BUG_ON(clock_idx == CPUCLOCK_SCHED);  	cpu_timer_sample_group(clock_idx, tsk, &now); @@ -1399,17 +1284,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,  		 * it to be absolute.  		 */  		if (*oldval) { -			if (*oldval <= now.cpu) { +			if (*oldval <= now) {  				/* Just about to fire. 
*/  				*oldval = cputime_one_jiffy;  			} else { -				*oldval -= now.cpu; +				*oldval -= now;  			}  		}  		if (!*newval)  			goto out; -		*newval += now.cpu; +		*newval += now;  	}  	/* @@ -1459,7 +1344,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,  		}  		while (!signal_pending(current)) { -			if (timer.it.cpu.expires.sched == 0) { +			if (timer.it.cpu.expires == 0) {  				/*  				 * Our timer fired and was reset, below  				 * deletion can not fail. diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index c6422ffeda9a..9012ecf7b814 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -32,7 +32,8 @@ static void try_to_suspend(struct work_struct *work)  	mutex_lock(&autosleep_lock); -	if (!pm_save_wakeup_count(initial_count)) { +	if (!pm_save_wakeup_count(initial_count) || +		system_state != SYSTEM_RUNNING) {  		mutex_unlock(&autosleep_lock);  		goto out;  	} diff --git a/kernel/power/process.c b/kernel/power/process.c index fc0df8486449..06ec8869dbf1 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -109,6 +109,8 @@ static int try_to_freeze_tasks(bool user_only)  /**   * freeze_processes - Signal user space processes to enter the refrigerator. + * The current thread will not be frozen.  The same process that calls + * freeze_processes must later call thaw_processes.   *   * On success, returns 0.  On failure, -errno and system is fully thawed.   */ @@ -120,6 +122,9 @@ int freeze_processes(void)  	if (error)  		return error; +	/* Make sure this task doesn't get frozen */ +	current->flags |= PF_SUSPEND_TASK; +  	if (!pm_freezing)  		atomic_inc(&system_freezing_cnt); @@ -168,6 +173,7 @@ int freeze_kernel_threads(void)  void thaw_processes(void)  {  	struct task_struct *g, *p; +	struct task_struct *curr = current;  	if (pm_freezing)  		atomic_dec(&system_freezing_cnt); @@ -182,10 +188,15 @@ void thaw_processes(void)  	read_lock(&tasklist_lock);  	do_each_thread(g, p) { +		/* No other threads should have PF_SUSPEND_TASK set */ +		WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));  		__thaw_task(p);  	} while_each_thread(g, p);  	read_unlock(&tasklist_lock); +	WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); +	curr->flags &= ~PF_SUSPEND_TASK; +  	usermodehelper_enable();  	schedule(); diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile new file mode 100644 index 000000000000..85405bdcf2b3 --- /dev/null +++ b/kernel/printk/Makefile @@ -0,0 +1,2 @@ +obj-y	= printk.o +obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c new file mode 100644 index 000000000000..276762f3a460 --- /dev/null +++ b/kernel/printk/braille.c @@ -0,0 +1,49 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/console.h> +#include <linux/string.h> + +#include "console_cmdline.h" +#include "braille.h" + +char *_braille_console_setup(char **str, char **brl_options) +{ +	if (!memcmp(*str, "brl,", 4)) { +		*brl_options = ""; +		*str += 4; +	} else if (!memcmp(str, "brl=", 4)) { +		*brl_options = *str + 4; +		*str = strchr(*brl_options, ','); +		if (!*str) +			pr_err("need port name after brl=\n"); +		else +			*((*str)++) = 0; +	} else +		return NULL; + +	return *str; +} + +int +_braille_register_console(struct console *console, struct console_cmdline *c) +{ +	int rtn = 0; + +	if (c->brl_options) { +		console->flags |= CON_BRL; +		rtn = braille_register_console(console, c->index, c->options, +					       c->brl_options); +	} + +	return rtn; +} + 
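The helpers above move the "brl," / "brl=<port>," prefix handling for console= out of printk.c. A userspace sketch of that prefix grammar, written independently of the kernel code; parse_brl_prefix() is an illustrative name:

#include <stdio.h>
#include <string.h>

/* Strip an optional braille prefix from a console= argument.
 * "brl,ttyS0"      -> brl_options = ""     , rest = "ttyS0"
 * "brl=port,ttyS0" -> brl_options = "port" , rest = "ttyS0"
 * anything else    -> brl_options = NULL   , rest unchanged
 */
static char *parse_brl_prefix(char **str, char **brl_options)
{
        *brl_options = NULL;

        if (!strncmp(*str, "brl,", 4)) {
                *brl_options = "";
                *str += 4;
        } else if (!strncmp(*str, "brl=", 4)) {
                *brl_options = *str + 4;
                *str = strchr(*brl_options, ',');
                if (!*str) {
                        fprintf(stderr, "need port name after brl=\n");
                        return NULL;
                }
                *(*str)++ = 0;  /* terminate the port name, skip the ',' */
        } else {
                return NULL;
        }
        return *str;
}

int main(void)
{
        char arg[] = "brl=usb0,ttyS0,115200";
        char *p = arg, *brl = NULL;

        parse_brl_prefix(&p, &brl);
        printf("brl options: '%s', console spec: '%s'\n",
               brl ? brl : "(none)", p);
        return 0;
}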
+int +_braille_unregister_console(struct console *console) +{ +	if (console->flags & CON_BRL) +		return braille_unregister_console(console); + +	return 0; +} diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h new file mode 100644 index 000000000000..769d771145c8 --- /dev/null +++ b/kernel/printk/braille.h @@ -0,0 +1,48 @@ +#ifndef _PRINTK_BRAILLE_H +#define _PRINTK_BRAILLE_H + +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + +static inline void +braille_set_options(struct console_cmdline *c, char *brl_options) +{ +	c->brl_options = brl_options; +} + +char * +_braille_console_setup(char **str, char **brl_options); + +int +_braille_register_console(struct console *console, struct console_cmdline *c); + +int +_braille_unregister_console(struct console *console); + +#else + +static inline void +braille_set_options(struct console_cmdline *c, char *brl_options) +{ +} + +static inline char * +_braille_console_setup(char **str, char **brl_options) +{ +	return NULL; +} + +static inline int +_braille_register_console(struct console *console, struct console_cmdline *c) +{ +	return 0; +} + +static inline int +_braille_unregister_console(struct console *console) +{ +	return 0; +} + +#endif + +#endif diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h new file mode 100644 index 000000000000..cbd69d842341 --- /dev/null +++ b/kernel/printk/console_cmdline.h @@ -0,0 +1,14 @@ +#ifndef _CONSOLE_CMDLINE_H +#define _CONSOLE_CMDLINE_H + +struct console_cmdline +{ +	char	name[8];			/* Name of the driver	    */ +	int	index;				/* Minor dev. to use	    */ +	char	*options;			/* Options for the driver   */ +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE +	char	*brl_options;			/* Options for braille driver */ +#endif +}; + +#endif diff --git a/kernel/printk.c b/kernel/printk/printk.c index 8212c1aef125..5b5a7080e2a5 100644 --- a/kernel/printk.c +++ b/kernel/printk/printk.c @@ -51,6 +51,9 @@  #define CREATE_TRACE_POINTS  #include <trace/events/printk.h> +#include "console_cmdline.h" +#include "braille.h" +  /* printk's without a loglevel use this.. */  #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -105,19 +108,11 @@ static struct console *exclusive_console;  /*   *	Array of consoles built from command line options (console=)   */ -struct console_cmdline -{ -	char	name[8];			/* Name of the driver	    */ -	int	index;				/* Minor dev. to use	    */ -	char	*options;			/* Options for the driver   */ -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE -	char	*brl_options;			/* Options for braille driver */ -#endif -};  #define MAX_CMDLINECONSOLES 8  static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; +  static int selected_console = -1;  static int preferred_console = -1;  int console_set_on_cmdline; @@ -178,7 +173,7 @@ static int console_may_schedule;   *         67                           "g"   *   0032     00 00 00                  padding to next message header   * - * The 'struct log' buffer header must never be directly exported to + * The 'struct printk_log' buffer header must never be directly exported to   * userspace, it is a kernel-private implementation detail that might   * need to be changed in the future, when the requirements change.   
* @@ -200,7 +195,7 @@ enum log_flags {  	LOG_CONT	= 8,	/* text is a fragment of a continuation line */  }; -struct log { +struct printk_log {  	u64 ts_nsec;		/* timestamp in nanoseconds */  	u16 len;		/* length of entire record */  	u16 text_len;		/* length of text buffer */ @@ -248,7 +243,7 @@ static u32 clear_idx;  #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)  #define LOG_ALIGN 4  #else -#define LOG_ALIGN __alignof__(struct log) +#define LOG_ALIGN __alignof__(struct printk_log)  #endif  #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)  static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); @@ -259,35 +254,35 @@ static u32 log_buf_len = __LOG_BUF_LEN;  static volatile unsigned int logbuf_cpu = UINT_MAX;  /* human readable text of the record */ -static char *log_text(const struct log *msg) +static char *log_text(const struct printk_log *msg)  { -	return (char *)msg + sizeof(struct log); +	return (char *)msg + sizeof(struct printk_log);  }  /* optional key/value pair dictionary attached to the record */ -static char *log_dict(const struct log *msg) +static char *log_dict(const struct printk_log *msg)  { -	return (char *)msg + sizeof(struct log) + msg->text_len; +	return (char *)msg + sizeof(struct printk_log) + msg->text_len;  }  /* get record by index; idx must point to valid msg */ -static struct log *log_from_idx(u32 idx) +static struct printk_log *log_from_idx(u32 idx)  { -	struct log *msg = (struct log *)(log_buf + idx); +	struct printk_log *msg = (struct printk_log *)(log_buf + idx);  	/*  	 * A length == 0 record is the end of buffer marker. Wrap around and  	 * read the message at the start of the buffer.  	 */  	if (!msg->len) -		return (struct log *)log_buf; +		return (struct printk_log *)log_buf;  	return msg;  }  /* get next record; idx must point to valid msg */  static u32 log_next(u32 idx)  { -	struct log *msg = (struct log *)(log_buf + idx); +	struct printk_log *msg = (struct printk_log *)(log_buf + idx);  	/* length == 0 indicates the end of the buffer; wrap */  	/* @@ -296,7 +291,7 @@ static u32 log_next(u32 idx)  	 * return the one after that.  	 */  	if (!msg->len) { -		msg = (struct log *)log_buf; +		msg = (struct printk_log *)log_buf;  		return msg->len;  	}  	return idx + msg->len; @@ -308,11 +303,11 @@ static void log_store(int facility, int level,  		      const char *dict, u16 dict_len,  		      const char *text, u16 text_len)  { -	struct log *msg; +	struct printk_log *msg;  	u32 size, pad_len;  	/* number of '\0' padding bytes to next message */ -	size = sizeof(struct log) + text_len + dict_len; +	size = sizeof(struct printk_log) + text_len + dict_len;  	pad_len = (-size) & (LOG_ALIGN - 1);  	size += pad_len; @@ -324,7 +319,7 @@ static void log_store(int facility, int level,  		else  			free = log_first_idx - log_next_idx; -		if (free > size + sizeof(struct log)) +		if (free > size + sizeof(struct printk_log))  			break;  		/* drop old messages until we have enough contiuous space */ @@ -332,18 +327,18 @@ static void log_store(int facility, int level,  		log_first_seq++;  	} -	if (log_next_idx + size + sizeof(struct log) >= log_buf_len) { +	if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) {  		/*  		 * This message + an additional empty header does not fit  		 * at the end of the buffer. Add an empty header with len == 0  		 * to signify a wrap around.  		 
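log_store() rounds every record up to LOG_ALIGN using pad_len = (-size) & (LOG_ALIGN - 1), which gives the distance to the next multiple of any power-of-two alignment. A quick demonstration with arbitrary sizes:

#include <stdio.h>

#define LOG_ALIGN 4U    /* must be a power of two */

int main(void)
{
        unsigned int sizes[] = { 17, 20, 33, 64 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                unsigned int size = sizes[i];
                unsigned int pad = (-size) & (LOG_ALIGN - 1);

                /* (-size) mod 2^k is exactly the distance up to the next
                 * multiple of 2^k, so size + pad is always aligned. */
                printf("size %2u -> pad %u -> total %2u\n",
                       size, pad, size + pad);
        }
        return 0;
}

Size 17 pads to 20, 33 pads to 36, and already-aligned sizes get no padding.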
*/ -		memset(log_buf + log_next_idx, 0, sizeof(struct log)); +		memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));  		log_next_idx = 0;  	}  	/* fill message */ -	msg = (struct log *)(log_buf + log_next_idx); +	msg = (struct printk_log *)(log_buf + log_next_idx);  	memcpy(log_text(msg), text, text_len);  	msg->text_len = text_len;  	memcpy(log_dict(msg), dict, dict_len); @@ -356,7 +351,7 @@ static void log_store(int facility, int level,  	else  		msg->ts_nsec = local_clock();  	memset(log_dict(msg) + dict_len, 0, pad_len); -	msg->len = sizeof(struct log) + text_len + dict_len + pad_len; +	msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len;  	/* insert message */  	log_next_idx += msg->len; @@ -479,7 +474,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,  			    size_t count, loff_t *ppos)  {  	struct devkmsg_user *user = file->private_data; -	struct log *msg; +	struct printk_log *msg;  	u64 ts_usec;  	size_t i;  	char cont = '-'; @@ -724,14 +719,14 @@ void log_buf_kexec_setup(void)  	VMCOREINFO_SYMBOL(log_first_idx);  	VMCOREINFO_SYMBOL(log_next_idx);  	/* -	 * Export struct log size and field offsets. User space tools can +	 * Export struct printk_log size and field offsets. User space tools can  	 * parse it and detect any changes to structure down the line.  	 */ -	VMCOREINFO_STRUCT_SIZE(log); -	VMCOREINFO_OFFSET(log, ts_nsec); -	VMCOREINFO_OFFSET(log, len); -	VMCOREINFO_OFFSET(log, text_len); -	VMCOREINFO_OFFSET(log, dict_len); +	VMCOREINFO_STRUCT_SIZE(printk_log); +	VMCOREINFO_OFFSET(printk_log, ts_nsec); +	VMCOREINFO_OFFSET(printk_log, len); +	VMCOREINFO_OFFSET(printk_log, text_len); +	VMCOREINFO_OFFSET(printk_log, dict_len);  }  #endif @@ -884,7 +879,7 @@ static size_t print_time(u64 ts, char *buf)  		       (unsigned long)ts, rem_nsec / 1000);  } -static size_t print_prefix(const struct log *msg, bool syslog, char *buf) +static size_t print_prefix(const struct printk_log *msg, bool syslog, char *buf)  {  	size_t len = 0;  	unsigned int prefix = (msg->facility << 3) | msg->level; @@ -907,7 +902,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf)  	return len;  } -static size_t msg_print_text(const struct log *msg, enum log_flags prev, +static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,  			     bool syslog, char *buf, size_t size)  {  	const char *text = log_text(msg); @@ -969,7 +964,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,  static int syslog_print(char __user *buf, int size)  {  	char *text; -	struct log *msg; +	struct printk_log *msg;  	int len = 0;  	text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); @@ -1060,7 +1055,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)  		idx = clear_idx;  		prev = 0;  		while (seq < log_next_seq) { -			struct log *msg = log_from_idx(idx); +			struct printk_log *msg = log_from_idx(idx);  			len += msg_print_text(msg, prev, true, NULL, 0);  			prev = msg->flags; @@ -1073,7 +1068,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)  		idx = clear_idx;  		prev = 0;  		while (len > size && seq < log_next_seq) { -			struct log *msg = log_from_idx(idx); +			struct printk_log *msg = log_from_idx(idx);  			len -= msg_print_text(msg, prev, true, NULL, 0);  			prev = msg->flags; @@ -1087,7 +1082,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)  		len = 0;  		prev = 0;  		while (len >= 0 && seq < next_seq) { -			struct log *msg = log_from_idx(idx); +		
	struct printk_log *msg = log_from_idx(idx);  			int textlen;  			textlen = msg_print_text(msg, prev, true, text, @@ -1233,7 +1228,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)  			error = 0;  			while (seq < log_next_seq) { -				struct log *msg = log_from_idx(idx); +				struct printk_log *msg = log_from_idx(idx);  				error += msg_print_text(msg, prev, true, NULL, 0);  				idx = log_next(idx); @@ -1369,9 +1364,9 @@ static int console_trylock_for_printk(unsigned int cpu)  		}  	}  	logbuf_cpu = UINT_MAX; +	raw_spin_unlock(&logbuf_lock);  	if (wake)  		up(&console_sem); -	raw_spin_unlock(&logbuf_lock);  	return retval;  } @@ -1719,10 +1714,10 @@ static struct cont {  	u8 level;  	bool flushed:1;  } cont; -static struct log *log_from_idx(u32 idx) { return NULL; } +static struct printk_log *log_from_idx(u32 idx) { return NULL; }  static u32 log_next(u32 idx) { return 0; }  static void call_console_drivers(int level, const char *text, size_t len) {} -static size_t msg_print_text(const struct log *msg, enum log_flags prev, +static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,  			     bool syslog, char *buf, size_t size) { return 0; }  static size_t cont_print_text(char *text, size_t size) { return 0; } @@ -1761,23 +1756,23 @@ static int __add_preferred_console(char *name, int idx, char *options,  	 *	See if this tty is not yet registered, and  	 *	if we have a slot free.  	 */ -	for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) -		if (strcmp(console_cmdline[i].name, name) == 0 && -			  console_cmdline[i].index == idx) { -				if (!brl_options) -					selected_console = i; -				return 0; +	for (i = 0, c = console_cmdline; +	     i < MAX_CMDLINECONSOLES && c->name[0]; +	     i++, c++) { +		if (strcmp(c->name, name) == 0 && c->index == idx) { +			if (!brl_options) +				selected_console = i; +			return 0;  		} +	}  	if (i == MAX_CMDLINECONSOLES)  		return -E2BIG;  	if (!brl_options)  		selected_console = i; -	c = &console_cmdline[i];  	strlcpy(c->name, name, sizeof(c->name));  	c->options = options; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE -	c->brl_options = brl_options; -#endif +	braille_set_options(c, brl_options); +  	c->index = idx;  	return 0;  } @@ -1790,20 +1785,8 @@ static int __init console_setup(char *str)  	char *s, *options, *brl_options = NULL;  	int idx; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE -	if (!memcmp(str, "brl,", 4)) { -		brl_options = ""; -		str += 4; -	} else if (!memcmp(str, "brl=", 4)) { -		brl_options = str + 4; -		str = strchr(brl_options, ','); -		if (!str) { -			printk(KERN_ERR "need port name after brl=\n"); -			return 1; -		} -		*(str++) = 0; -	} -#endif +	if (_braille_console_setup(&str, &brl_options)) +		return 1;  	/*  	 * Decode str into name, index, options. 
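console_setup() then splits the remaining string, e.g. "ttyS0,115200n8", into a driver name, an index and an option string before handing it to __add_preferred_console(). A rough userspace sketch of that split; it skips the legacy forms (such as "ttya" or a bare baud rate) that the real code also accepts:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        char buf[64] = "ttyS0,115200n8";
        char *options, *s;
        int idx;

        /* Everything after the first ',' is the option string. */
        options = strchr(buf, ',');
        if (options)
                *(options++) = 0;

        /* Trailing digits of the name form the index: "ttyS0" -> "ttyS", 0. */
        for (s = buf; *s; s++)
                if (isdigit((unsigned char)*s))
                        break;
        idx = atoi(s);
        *s = 0;

        printf("name='%s' index=%d options='%s'\n",
               buf, idx, options ? options : "");
        return 0;
}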
@@ -1858,15 +1841,15 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha  	struct console_cmdline *c;  	int i; -	for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) -		if (strcmp(console_cmdline[i].name, name) == 0 && -			  console_cmdline[i].index == idx) { -				c = &console_cmdline[i]; -				strlcpy(c->name, name_new, sizeof(c->name)); -				c->name[sizeof(c->name) - 1] = 0; -				c->options = options; -				c->index = idx_new; -				return i; +	for (i = 0, c = console_cmdline; +	     i < MAX_CMDLINECONSOLES && c->name[0]; +	     i++, c++) +		if (strcmp(c->name, name) == 0 && c->index == idx) { +			strlcpy(c->name, name_new, sizeof(c->name)); +			c->name[sizeof(c->name) - 1] = 0; +			c->options = options; +			c->index = idx_new; +			return i;  		}  	/* not found */  	return -1; @@ -1921,7 +1904,7 @@ void resume_console(void)   * called when a new CPU comes online (or fails to come up), and ensures   * that any such output gets printed.   */ -static int __cpuinit console_cpu_notify(struct notifier_block *self, +static int console_cpu_notify(struct notifier_block *self,  	unsigned long action, void *hcpu)  {  	switch (action) { @@ -2046,7 +2029,7 @@ void console_unlock(void)  	console_cont_flush(text, sizeof(text));  again:  	for (;;) { -		struct log *msg; +		struct printk_log *msg;  		size_t len;  		int level; @@ -2241,6 +2224,7 @@ void register_console(struct console *newcon)  	int i;  	unsigned long flags;  	struct console *bcon = NULL; +	struct console_cmdline *c;  	/*  	 * before we register a new CON_BOOT console, make sure we don't @@ -2288,30 +2272,25 @@ void register_console(struct console *newcon)  	 *	See if this console matches one we selected on  	 *	the command line.  	 */ -	for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; -			i++) { -		if (strcmp(console_cmdline[i].name, newcon->name) != 0) +	for (i = 0, c = console_cmdline; +	     i < MAX_CMDLINECONSOLES && c->name[0]; +	     i++, c++) { +		if (strcmp(c->name, newcon->name) != 0)  			continue;  		if (newcon->index >= 0 && -		    newcon->index != console_cmdline[i].index) +		    newcon->index != c->index)  			continue;  		if (newcon->index < 0) -			newcon->index = console_cmdline[i].index; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE -		if (console_cmdline[i].brl_options) { -			newcon->flags |= CON_BRL; -			braille_register_console(newcon, -					console_cmdline[i].index, -					console_cmdline[i].options, -					console_cmdline[i].brl_options); +			newcon->index = c->index; + +		if (_braille_register_console(newcon, c))  			return; -		} -#endif +  		if (newcon->setup &&  		    newcon->setup(newcon, console_cmdline[i].options) != 0)  			break;  		newcon->flags |= CON_ENABLED; -		newcon->index = console_cmdline[i].index; +		newcon->index = c->index;  		if (i == selected_console) {  			newcon->flags |= CON_CONSDEV;  			preferred_console = selected_console; @@ -2394,13 +2373,13 @@ EXPORT_SYMBOL(register_console);  int unregister_console(struct console *console)  {          struct console *a, *b; -	int res = 1; +	int res; -#ifdef CONFIG_A11Y_BRAILLE_CONSOLE -	if (console->flags & CON_BRL) -		return braille_unregister_console(console); -#endif +	res = _braille_unregister_console(console); +	if (res) +		return res; +	res = 1;  	console_lock();  	if (console_drivers == console) {  		console_drivers=console->next; @@ -2666,7 +2645,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)  bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,  			       char *line, size_t 
size, size_t *len)  { -	struct log *msg; +	struct printk_log *msg;  	size_t l = 0;  	bool ret = false; @@ -2778,7 +2757,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,  	idx = dumper->cur_idx;  	prev = 0;  	while (seq < dumper->next_seq) { -		struct log *msg = log_from_idx(idx); +		struct printk_log *msg = log_from_idx(idx);  		l += msg_print_text(msg, prev, true, NULL, 0);  		idx = log_next(idx); @@ -2791,7 +2770,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,  	idx = dumper->cur_idx;  	prev = 0;  	while (l > size && seq < dumper->next_seq) { -		struct log *msg = log_from_idx(idx); +		struct printk_log *msg = log_from_idx(idx);  		l -= msg_print_text(msg, prev, true, NULL, 0);  		idx = log_next(idx); @@ -2806,7 +2785,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,  	l = 0;  	prev = 0;  	while (seq < dumper->next_seq) { -		struct log *msg = log_from_idx(idx); +		struct printk_log *msg = log_from_idx(idx);  		l += msg_print_text(msg, prev, syslog, buf + l, size - l);  		idx = log_next(idx); diff --git a/kernel/profile.c b/kernel/profile.c index 0bf400737660..6631e1ef55ab 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -331,7 +331,7 @@ out:  	put_cpu();  } -static int __cpuinit profile_cpu_callback(struct notifier_block *info, +static int profile_cpu_callback(struct notifier_block *info,  					unsigned long action, void *__cpu)  {  	int node, cpu = (unsigned long)__cpu; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ba5e6cea181a..a146ee327f6a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1221,19 +1221,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,  	return ret;  }  #endif	/* CONFIG_COMPAT */ - -#ifdef CONFIG_HAVE_HW_BREAKPOINT -int ptrace_get_breakpoints(struct task_struct *tsk) -{ -	if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) -		return 0; - -	return -1; -} - -void ptrace_put_breakpoints(struct task_struct *tsk) -{ -	if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) -		flush_ptrace_hw_breakpoint(tsk); -} -#endif /* CONFIG_HAVE_HW_BREAKPOINT */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index b1fa5510388d..f4871e52c546 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1476,7 +1476,7 @@ rcu_torture_shutdown(void *arg)   * Execute random CPU-hotplug operations at the interval specified   * by the onoff_interval.   */ -static int __cpuinit +static int  rcu_torture_onoff(void *arg)  {  	int cpu; @@ -1558,7 +1558,7 @@ rcu_torture_onoff(void *arg)  	return 0;  } -static int __cpuinit +static int  rcu_torture_onoff_init(void)  {  	int ret; @@ -1601,7 +1601,7 @@ static void rcu_torture_onoff_cleanup(void)   * CPU-stall kthread.  It waits as specified by stall_cpu_holdoff, then   * induces a CPU stall for the time specified by stall_cpu.   */ -static int __cpuinit rcu_torture_stall(void *args) +static int rcu_torture_stall(void *args)  {  	unsigned long stop_at; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e08abb9461ac..068de3a93606 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2910,7 +2910,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)   * can accept some slop in the rsp->completed access due to the fact   * that this CPU cannot possibly have any RCU callbacks in flight yet.   
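Most of the hunks in this stretch simply drop the __cpuinit/__cpuinitdata annotations now that CPU-hotplug callbacks stay resident unconditionally. For reference, a minimal module-style hotplug callback as it looks after this series, against the notifier interface that was current in 3.x; the demo_* names are made up:

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int demo_cpu_notify(struct notifier_block *self,
                           unsigned long action, void *hcpu)
{
        long cpu = (long)hcpu;

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
                pr_info("demo: cpu %ld came online\n", cpu);
                break;
        case CPU_DEAD:
                pr_info("demo: cpu %ld went offline\n", cpu);
                break;
        }
        return NOTIFY_OK;
}

/* No __cpuinitdata: the notifier block is needed for runtime hotplug. */
static struct notifier_block demo_cpu_notifier = {
        .notifier_call = demo_cpu_notify,
};

static int __init demo_init(void)
{
        register_cpu_notifier(&demo_cpu_notifier);
        return 0;
}

static void __exit demo_exit(void)
{
        unregister_cpu_notifier(&demo_cpu_notifier);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");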
*/ -static void __cpuinit +static void  rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  {  	unsigned long flags; @@ -2962,7 +2962,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)  	mutex_unlock(&rsp->onoff_mutex);  } -static void __cpuinit rcu_prepare_cpu(int cpu) +static void rcu_prepare_cpu(int cpu)  {  	struct rcu_state *rsp; @@ -2974,7 +2974,7 @@ static void __cpuinit rcu_prepare_cpu(int cpu)  /*   * Handle CPU online/offline notification events.   */ -static int __cpuinit rcu_cpu_notify(struct notifier_block *self, +static int rcu_cpu_notify(struct notifier_block *self,  				    unsigned long action, void *hcpu)  {  	long cpu = (long)hcpu; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4a39d364493c..b3832581043c 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -521,10 +521,10 @@ static void invoke_rcu_callbacks_kthread(void);  static bool rcu_is_callbacks_kthread(void);  #ifdef CONFIG_RCU_BOOST  static void rcu_preempt_do_callbacks(void); -static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, +static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,  						 struct rcu_node *rnp);  #endif /* #ifdef CONFIG_RCU_BOOST */ -static void __cpuinit rcu_prepare_kthreads(int cpu); +static void rcu_prepare_kthreads(int cpu);  static void rcu_cleanup_after_idle(int cpu);  static void rcu_prepare_for_idle(int cpu);  static void rcu_idle_count_callbacks_posted(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 63098a59216e..769e12e3151b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1352,7 +1352,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)   * already exist.  We only create this kthread for preemptible RCU.   * Returns zero if all is well, a negated errno otherwise.   
*/ -static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, +static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,  						 struct rcu_node *rnp)  {  	int rnp_index = rnp - &rsp->node[0]; @@ -1507,7 +1507,7 @@ static int __init rcu_spawn_kthreads(void)  }  early_initcall(rcu_spawn_kthreads); -static void __cpuinit rcu_prepare_kthreads(int cpu) +static void rcu_prepare_kthreads(int cpu)  {  	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);  	struct rcu_node *rnp = rdp->mynode; @@ -1549,7 +1549,7 @@ static int __init rcu_scheduler_really_started(void)  }  early_initcall(rcu_scheduler_really_started); -static void __cpuinit rcu_prepare_kthreads(int cpu) +static void rcu_prepare_kthreads(int cpu)  {  } diff --git a/kernel/reboot.c b/kernel/reboot.c new file mode 100644 index 000000000000..269ed9384cc4 --- /dev/null +++ b/kernel/reboot.c @@ -0,0 +1,419 @@ +/* + *  linux/kernel/reboot.c + * + *  Copyright (C) 2013  Linus Torvalds + */ + +#define pr_fmt(fmt)	"reboot: " fmt + +#include <linux/ctype.h> +#include <linux/export.h> +#include <linux/kexec.h> +#include <linux/kmod.h> +#include <linux/kmsg_dump.h> +#include <linux/reboot.h> +#include <linux/suspend.h> +#include <linux/syscalls.h> +#include <linux/syscore_ops.h> +#include <linux/uaccess.h> + +/* + * this indicates whether you can reboot with ctrl-alt-del: the default is yes + */ + +int C_A_D = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); + +#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32) +#define DEFAULT_REBOOT_MODE		= REBOOT_HARD +#else +#define DEFAULT_REBOOT_MODE +#endif +enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; + +int reboot_default; +int reboot_cpu; +enum reboot_type reboot_type = BOOT_ACPI; +int reboot_force; + +/* + * If set, this is used for preparing the system to power off. + */ + +void (*pm_power_off_prepare)(void); + +/** + *	emergency_restart - reboot the system + * + *	Without shutting down any hardware or taking any locks + *	reboot the system.  This is called when we know we are in + *	trouble so this is our best effort to reboot.  This is + *	safe to call in interrupt context. + */ +void emergency_restart(void) +{ +	kmsg_dump(KMSG_DUMP_EMERG); +	machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + +void kernel_restart_prepare(char *cmd) +{ +	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); +	system_state = SYSTEM_RESTART; +	usermodehelper_disable(); +	device_shutdown(); +} + +/** + *	register_reboot_notifier - Register function to be called at reboot time + *	@nb: Info about notifier function to be called + * + *	Registers a function with the list of functions + *	to be called at reboot time. + * + *	Currently always returns zero, as blocking_notifier_chain_register() + *	always returns zero. + */ +int register_reboot_notifier(struct notifier_block *nb) +{ +	return blocking_notifier_chain_register(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(register_reboot_notifier); + +/** + *	unregister_reboot_notifier - Unregister previously registered reboot notifier + *	@nb: Hook to be unregistered + * + *	Unregisters a previously registered reboot + *	notifier function. + * + *	Returns zero on success, or %-ENOENT on failure. 
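register_reboot_notifier() and unregister_reboot_notifier() keep their existing behaviour; only their home moves from kernel/sys.c into the new kernel/reboot.c. A minimal sketch of a module hooking the reboot chain; the demo_* names are illustrative:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int demo_reboot_notify(struct notifier_block *nb,
                              unsigned long action, void *data)
{
        /* action is SYS_RESTART, SYS_HALT or SYS_POWER_OFF;
         * data is the restart command string, if any. */
        pr_info("demo: reboot notifier called, action=%lu\n", action);
        return NOTIFY_DONE;
}

static struct notifier_block demo_reboot_nb = {
        .notifier_call = demo_reboot_notify,
};

static int __init demo_init(void)
{
        return register_reboot_notifier(&demo_reboot_nb);
}

static void __exit demo_exit(void)
{
        unregister_reboot_notifier(&demo_reboot_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");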
+ */ +int unregister_reboot_notifier(struct notifier_block *nb) +{ +	return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(unregister_reboot_notifier); + +static void migrate_to_reboot_cpu(void) +{ +	/* The boot cpu is always logical cpu 0 */ +	int cpu = reboot_cpu; + +	cpu_hotplug_disable(); + +	/* Make certain the cpu I'm about to reboot on is online */ +	if (!cpu_online(cpu)) +		cpu = cpumask_first(cpu_online_mask); + +	/* Prevent races with other tasks migrating this task */ +	current->flags |= PF_NO_SETAFFINITY; + +	/* Make certain I only run on the appropriate processor */ +	set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + +/** + *	kernel_restart - reboot the system + *	@cmd: pointer to buffer containing command to execute for restart + *		or %NULL + * + *	Shutdown everything and perform a clean reboot. + *	This is not safe to call in interrupt context. + */ +void kernel_restart(char *cmd) +{ +	kernel_restart_prepare(cmd); +	migrate_to_reboot_cpu(); +	syscore_shutdown(); +	if (!cmd) +		pr_emerg("Restarting system\n"); +	else +		pr_emerg("Restarting system with command '%s'\n", cmd); +	kmsg_dump(KMSG_DUMP_RESTART); +	machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); + +static void kernel_shutdown_prepare(enum system_states state) +{ +	blocking_notifier_call_chain(&reboot_notifier_list, +		(state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); +	system_state = state; +	usermodehelper_disable(); +	device_shutdown(); +} +/** + *	kernel_halt - halt the system + * + *	Shutdown everything and perform a clean system halt. + */ +void kernel_halt(void) +{ +	kernel_shutdown_prepare(SYSTEM_HALT); +	migrate_to_reboot_cpu(); +	syscore_shutdown(); +	pr_emerg("System halted\n"); +	kmsg_dump(KMSG_DUMP_HALT); +	machine_halt(); +} +EXPORT_SYMBOL_GPL(kernel_halt); + +/** + *	kernel_power_off - power_off the system + * + *	Shutdown everything and perform a clean system power_off. + */ +void kernel_power_off(void) +{ +	kernel_shutdown_prepare(SYSTEM_POWER_OFF); +	if (pm_power_off_prepare) +		pm_power_off_prepare(); +	migrate_to_reboot_cpu(); +	syscore_shutdown(); +	pr_emerg("Power down\n"); +	kmsg_dump(KMSG_DUMP_POWEROFF); +	machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); + +static DEFINE_MUTEX(reboot_mutex); + +/* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers + * so that some mistake won't make this reboot the whole machine. + * You can also set the meaning of the ctrl-alt-del-key here. + * + * reboot doesn't sync: do that yourself before calling this. + */ +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, +		void __user *, arg) +{ +	struct pid_namespace *pid_ns = task_active_pid_ns(current); +	char buffer[256]; +	int ret = 0; + +	/* We only trust the superuser with rebooting the system. */ +	if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) +		return -EPERM; + +	/* For safety, we require "magic" arguments. */ +	if (magic1 != LINUX_REBOOT_MAGIC1 || +			(magic2 != LINUX_REBOOT_MAGIC2 && +			magic2 != LINUX_REBOOT_MAGIC2A && +			magic2 != LINUX_REBOOT_MAGIC2B && +			magic2 != LINUX_REBOOT_MAGIC2C)) +		return -EINVAL; + +	/* +	 * If pid namespaces are enabled and the current task is in a child +	 * pid_namespace, the command is handled by reboot_pid_ns() which will +	 * call do_exit(). 
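The entry point above is the kernel side of reboot(2): user space must still supply both magic numbers, and (further down) LINUX_REBOOT_CMD_RESTART2 passes a command string through to kernel_restart(). A small root-only caller using the raw syscall; the "bootloader" argument is just an example string, and the program really does restart the machine if it succeeds:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/reboot.h>

int main(void)
{
        long ret;

        sync();         /* reboot(2) does not sync; do it ourselves */

        ret = syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                      LINUX_REBOOT_CMD_RESTART2, "bootloader");

        /* Only reached on failure, e.g. when CAP_SYS_BOOT is missing. */
        if (ret)
                perror("reboot");
        return 1;
}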
+	 */ +	ret = reboot_pid_ns(pid_ns, cmd); +	if (ret) +		return ret; + +	/* Instead of trying to make the power_off code look like +	 * halt when pm_power_off is not set do it the easy way. +	 */ +	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) +		cmd = LINUX_REBOOT_CMD_HALT; + +	mutex_lock(&reboot_mutex); +	switch (cmd) { +	case LINUX_REBOOT_CMD_RESTART: +		kernel_restart(NULL); +		break; + +	case LINUX_REBOOT_CMD_CAD_ON: +		C_A_D = 1; +		break; + +	case LINUX_REBOOT_CMD_CAD_OFF: +		C_A_D = 0; +		break; + +	case LINUX_REBOOT_CMD_HALT: +		kernel_halt(); +		do_exit(0); +		panic("cannot halt"); + +	case LINUX_REBOOT_CMD_POWER_OFF: +		kernel_power_off(); +		do_exit(0); +		break; + +	case LINUX_REBOOT_CMD_RESTART2: +		ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1); +		if (ret < 0) { +			ret = -EFAULT; +			break; +		} +		buffer[sizeof(buffer) - 1] = '\0'; + +		kernel_restart(buffer); +		break; + +#ifdef CONFIG_KEXEC +	case LINUX_REBOOT_CMD_KEXEC: +		ret = kernel_kexec(); +		break; +#endif + +#ifdef CONFIG_HIBERNATION +	case LINUX_REBOOT_CMD_SW_SUSPEND: +		ret = hibernate(); +		break; +#endif + +	default: +		ret = -EINVAL; +		break; +	} +	mutex_unlock(&reboot_mutex); +	return ret; +} + +static void deferred_cad(struct work_struct *dummy) +{ +	kernel_restart(NULL); +} + +/* + * This function gets called by ctrl-alt-del - ie the keyboard interrupt. + * As it's called within an interrupt, it may NOT sync: the only choice + * is whether to reboot at once, or just ignore the ctrl-alt-del. + */ +void ctrl_alt_del(void) +{ +	static DECLARE_WORK(cad_work, deferred_cad); + +	if (C_A_D) +		schedule_work(&cad_work); +	else +		kill_cad_pid(SIGINT, 1); +} + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static int __orderly_poweroff(bool force) +{ +	char **argv; +	static char *envp[] = { +		"HOME=/", +		"PATH=/sbin:/bin:/usr/sbin:/usr/bin", +		NULL +	}; +	int ret; + +	argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); +	if (argv) { +		ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); +		argv_free(argv); +	} else { +		ret = -ENOMEM; +	} + +	if (ret && force) { +		pr_warn("Failed to start orderly shutdown: forcing the issue\n"); +		/* +		 * I guess this should try to kick off some daemon to sync and +		 * poweroff asap.  Or not even bother syncing if we're doing an +		 * emergency shutdown? +		 */ +		emergency_sync(); +		kernel_power_off(); +	} + +	return ret; +} + +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ +	__orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. + */ +int orderly_poweroff(bool force) +{ +	if (force) /* do not override the pending "true" */ +		poweroff_force = true; +	schedule_work(&poweroff_work); +	return 0; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); + +static int __init reboot_setup(char *str) +{ +	for (;;) { +		/* +		 * Having anything passed on the command line via +		 * reboot= will cause us to disable DMI checking +		 * below. 
+		 */ +		reboot_default = 0; + +		switch (*str) { +		case 'w': +			reboot_mode = REBOOT_WARM; +			break; + +		case 'c': +			reboot_mode = REBOOT_COLD; +			break; + +		case 'h': +			reboot_mode = REBOOT_HARD; +			break; + +		case 's': +			if (isdigit(*(str+1))) +				reboot_cpu = simple_strtoul(str+1, NULL, 0); +			else if (str[1] == 'm' && str[2] == 'p' && +							isdigit(*(str+3))) +				reboot_cpu = simple_strtoul(str+3, NULL, 0); +			else +				reboot_mode = REBOOT_SOFT; +			break; + +		case 'g': +			reboot_mode = REBOOT_GPIO; +			break; + +		case 'b': +		case 'a': +		case 'k': +		case 't': +		case 'e': +		case 'p': +			reboot_type = *str; +			break; + +		case 'f': +			reboot_force = 1; +			break; +		} + +		str = strchr(str, ','); +		if (str) +			str++; +		else +			break; +	} +	return 1; +} +__setup("reboot=", reboot_setup); diff --git a/kernel/relay.c b/kernel/relay.c index b91488ba2e5a..5001c9887db1 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -516,7 +516,7 @@ static void setup_callbacks(struct rchan *chan,   *   * 	Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)   */ -static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, +static int relay_hotcpu_callback(struct notifier_block *nb,  				unsigned long action,  				void *hcpu)  { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b1f2e533b95..b7c32cb7bfeb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void)  #ifdef CONFIG_SCHED_HRTICK  /*   * Use HR-timers to deliver accurate preemption points. - * - * Its all a bit involved since we cannot program an hrt while holding the - * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a - * reschedule event. - * - * When we get rescheduled we reprogram the hrtick_timer outside of the - * rq->lock.   */  static void hrtick_clear(struct rq *rq) @@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)  }  #ifdef CONFIG_SMP + +static int __hrtick_restart(struct rq *rq) +{ +	struct hrtimer *timer = &rq->hrtick_timer; +	ktime_t time = hrtimer_get_softexpires(timer); + +	return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); +} +  /*   * called from hardirq (IPI) context   */ @@ -412,7 +414,7 @@ static void __hrtick_start(void *arg)  	struct rq *rq = arg;  	raw_spin_lock(&rq->lock); -	hrtimer_restart(&rq->hrtick_timer); +	__hrtick_restart(rq);  	rq->hrtick_csd_pending = 0;  	raw_spin_unlock(&rq->lock);  } @@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay)  	hrtimer_set_expires(timer, time);  	if (rq == this_rq()) { -		hrtimer_restart(timer); +		__hrtick_restart(rq);  	} else if (!rq->hrtick_csd_pending) {  		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);  		rq->hrtick_csd_pending = 1; @@ -4131,7 +4133,7 @@ void show_state_filter(unsigned long state_filter)  		debug_show_all_locks();  } -void __cpuinit init_idle_bootup_task(struct task_struct *idle) +void init_idle_bootup_task(struct task_struct *idle)  {  	idle->sched_class = &idle_sched_class;  } @@ -4144,7 +4146,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)   * NOTE: this function does not set the idle thread's NEED_RESCHED   * flag, to make booting more robust.   
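Further up in this hunk, __hrtick_restart() re-arms the hrtick in HRTIMER_MODE_ABS_PINNED so the timer cannot migrate off the runqueue's CPU. A stripped-down module sketch of arming a pinned hrtimer through the ordinary hrtimer API, using relative mode for simplicity; the demo_* names are made up:

#include <linux/hrtimer.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/ktime.h>
#include <linux/module.h>
#include <linux/smp.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
        pr_info("demo: pinned hrtimer fired on cpu %d\n", smp_processor_id());
        return HRTIMER_NORESTART;
}

static int __init demo_init(void)
{
        hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
        demo_timer.function = demo_timer_fn;
        /* The _PINNED mode keeps the timer on the CPU that armed it. */
        hrtimer_start(&demo_timer, ktime_set(1, 0), HRTIMER_MODE_REL_PINNED);
        return 0;
}

static void __exit demo_exit(void)
{
        hrtimer_cancel(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");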
*/ -void __cpuinit init_idle(struct task_struct *idle, int cpu) +void init_idle(struct task_struct *idle, int cpu)  {  	struct rq *rq = cpu_rq(cpu);  	unsigned long flags; @@ -4628,7 +4630,7 @@ static void set_rq_offline(struct rq *rq)   * migration_call - callback that gets triggered when a CPU is added.   * Here we can start up the necessary migration thread for the new CPU.   */ -static int __cpuinit +static int  migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  {  	int cpu = (long)hcpu; @@ -4682,12 +4684,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)   * happens before everything else.  This has to be lower priority than   * the notifier in the perf_event subsystem, though.   */ -static struct notifier_block __cpuinitdata migration_notifier = { +static struct notifier_block migration_notifier = {  	.notifier_call = migration_call,  	.priority = CPU_PRI_MIGRATION,  }; -static int __cpuinit sched_cpu_active(struct notifier_block *nfb, +static int sched_cpu_active(struct notifier_block *nfb,  				      unsigned long action, void *hcpu)  {  	switch (action & ~CPU_TASKS_FROZEN) { @@ -4700,7 +4702,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,  	}  } -static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, +static int sched_cpu_inactive(struct notifier_block *nfb,  					unsigned long action, void *hcpu)  {  	switch (action & ~CPU_TASKS_FROZEN) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f77f9c527449..9565645e3202 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -851,7 +851,7 @@ void task_numa_fault(int node, int pages, bool migrated)  {  	struct task_struct *p = current; -	if (!sched_feat_numa(NUMA)) +	if (!numabalancing_enabled)  		return;  	/* FIXME: Allocate task-specific structure for placement policy here */ @@ -5506,7 +5506,7 @@ void nohz_balance_enter_idle(int cpu)  	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));  } -static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, +static int sched_ilb_notifier(struct notifier_block *nfb,  					unsigned long action, void *hcpu)  {  	switch (action & ~CPU_TASKS_FROZEN) { @@ -5786,7 +5786,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)  		entity_tick(cfs_rq, se, queued);  	} -	if (sched_feat_numa(NUMA)) +	if (numabalancing_enabled)  		task_tick_numa(rq, curr);  	update_rq_runnable_avg(rq, 1); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 17d7065c3872..5aef494fc8b4 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -162,6 +162,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)   */  /** + * cputimer_running - return true if cputimer is running + * + * @tsk:	Pointer to target task. + */ +static inline bool cputimer_running(struct task_struct *tsk) + +{ +	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + +	if (!cputimer->running) +		return false; + +	/* +	 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime +	 * in __exit_signal(), we won't account to the signal struct further +	 * cputime consumed by that task, even though the task can still be +	 * ticking after __exit_signal(). +	 * +	 * In order to keep a consistent behaviour between thread group cputime +	 * and thread group cputimer accounting, lets also ignore the cputime +	 * elapsing after __exit_signal() in any thread group timer running. 
+	 * +	 * This makes sure that POSIX CPU clocks and timers are synchronized, so +	 * that a POSIX CPU timer won't expire while the corresponding POSIX CPU +	 * clock delta is behind the expiring timer value. +	 */ +	if (unlikely(!tsk->sighand)) +		return false; + +	return true; +} + +/**   * account_group_user_time - Maintain utime for a thread group.   *   * @tsk:	Pointer to task structure. @@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk,  {  	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; -	if (!cputimer->running) +	if (!cputimer_running(tsk))  		return;  	raw_spin_lock(&cputimer->lock); @@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk,  {  	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; -	if (!cputimer->running) +	if (!cputimer_running(tsk))  		return;  	raw_spin_lock(&cputimer->lock); @@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,  {  	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; -	if (!cputimer->running) +	if (!cputimer_running(tsk))  		return;  	raw_spin_lock(&cputimer->lock); diff --git a/kernel/smp.c b/kernel/smp.c index 4dba0f7b72ad..fe9f773d7114 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -73,7 +73,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)  	return NOTIFY_OK;  } -static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { +static struct notifier_block hotplug_cfd_notifier = {  	.notifier_call		= hotplug_cfd,  }; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 02fc5c933673..eb89e1807408 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -24,7 +24,7 @@   */  static DEFINE_PER_CPU(struct task_struct *, idle_threads); -struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) +struct task_struct *idle_thread_get(unsigned int cpu)  {  	struct task_struct *tsk = per_cpu(idle_threads, cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index ca25e6e704a2..be3d3514c325 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -699,7 +699,7 @@ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)  }  EXPORT_SYMBOL(send_remote_softirq); -static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, +static int remote_softirq_cpu_notify(struct notifier_block *self,  					       unsigned long action, void *hcpu)  {  	/* @@ -728,7 +728,7 @@ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,  	return NOTIFY_OK;  } -static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { +static struct notifier_block remote_softirq_cpu_notifier = {  	.notifier_call	= remote_softirq_cpu_notify,  }; @@ -830,7 +830,7 @@ static void takeover_tasklets(unsigned int cpu)  }  #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit cpu_callback(struct notifier_block *nfb, +static int cpu_callback(struct notifier_block *nfb,  				  unsigned long action,  				  void *hcpu)  { @@ -845,7 +845,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,  	return NOTIFY_OK;  } -static struct notifier_block __cpuinitdata cpu_nfb = { +static struct notifier_block cpu_nfb = {  	.notifier_call = cpu_callback  }; diff --git a/kernel/sys.c b/kernel/sys.c index 071de900c824..771129b299f8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -116,20 +116,6 @@ EXPORT_SYMBOL(fs_overflowuid);  EXPORT_SYMBOL(fs_overflowgid);  /* - * this indicates whether you can reboot with ctrl-alt-del: the default is yes - */ - 
-int C_A_D = 1; -struct pid *cad_pid; -EXPORT_SYMBOL(cad_pid); - -/* - * If set, this is used for preparing the system to power off. - */ - -void (*pm_power_off_prepare)(void); - -/*   * Returns true if current's euid is same as p's uid or euid,   * or has CAP_SYS_NICE to p's user_ns.   * @@ -308,266 +294,6 @@ out_unlock:  	return retval;  } -/** - *	emergency_restart - reboot the system - * - *	Without shutting down any hardware or taking any locks - *	reboot the system.  This is called when we know we are in - *	trouble so this is our best effort to reboot.  This is - *	safe to call in interrupt context. - */ -void emergency_restart(void) -{ -	kmsg_dump(KMSG_DUMP_EMERG); -	machine_emergency_restart(); -} -EXPORT_SYMBOL_GPL(emergency_restart); - -void kernel_restart_prepare(char *cmd) -{ -	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); -	system_state = SYSTEM_RESTART; -	usermodehelper_disable(); -	device_shutdown(); -} - -/** - *	register_reboot_notifier - Register function to be called at reboot time - *	@nb: Info about notifier function to be called - * - *	Registers a function with the list of functions - *	to be called at reboot time. - * - *	Currently always returns zero, as blocking_notifier_chain_register() - *	always returns zero. - */ -int register_reboot_notifier(struct notifier_block *nb) -{ -	return blocking_notifier_chain_register(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(register_reboot_notifier); - -/** - *	unregister_reboot_notifier - Unregister previously registered reboot notifier - *	@nb: Hook to be unregistered - * - *	Unregisters a previously registered reboot - *	notifier function. - * - *	Returns zero on success, or %-ENOENT on failure. - */ -int unregister_reboot_notifier(struct notifier_block *nb) -{ -	return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); -} -EXPORT_SYMBOL(unregister_reboot_notifier); - -/* Add backwards compatibility for stable trees. */ -#ifndef PF_NO_SETAFFINITY -#define PF_NO_SETAFFINITY		PF_THREAD_BOUND -#endif - -static void migrate_to_reboot_cpu(void) -{ -	/* The boot cpu is always logical cpu 0 */ -	int cpu = 0; - -	cpu_hotplug_disable(); - -	/* Make certain the cpu I'm about to reboot on is online */ -	if (!cpu_online(cpu)) -		cpu = cpumask_first(cpu_online_mask); - -	/* Prevent races with other tasks migrating this task */ -	current->flags |= PF_NO_SETAFFINITY; - -	/* Make certain I only run on the appropriate processor */ -	set_cpus_allowed_ptr(current, cpumask_of(cpu)); -} - -/** - *	kernel_restart - reboot the system - *	@cmd: pointer to buffer containing command to execute for restart - *		or %NULL - * - *	Shutdown everything and perform a clean reboot. - *	This is not safe to call in interrupt context. - */ -void kernel_restart(char *cmd) -{ -	kernel_restart_prepare(cmd); -	migrate_to_reboot_cpu(); -	syscore_shutdown(); -	if (!cmd) -		printk(KERN_EMERG "Restarting system.\n"); -	else -		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); -	kmsg_dump(KMSG_DUMP_RESTART); -	machine_restart(cmd); -} -EXPORT_SYMBOL_GPL(kernel_restart); - -static void kernel_shutdown_prepare(enum system_states state) -{ -	blocking_notifier_call_chain(&reboot_notifier_list, -		(state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); -	system_state = state; -	usermodehelper_disable(); -	device_shutdown(); -} -/** - *	kernel_halt - halt the system - * - *	Shutdown everything and perform a clean system halt. 
- */ -void kernel_halt(void) -{ -	kernel_shutdown_prepare(SYSTEM_HALT); -	migrate_to_reboot_cpu(); -	syscore_shutdown(); -	printk(KERN_EMERG "System halted.\n"); -	kmsg_dump(KMSG_DUMP_HALT); -	machine_halt(); -} - -EXPORT_SYMBOL_GPL(kernel_halt); - -/** - *	kernel_power_off - power_off the system - * - *	Shutdown everything and perform a clean system power_off. - */ -void kernel_power_off(void) -{ -	kernel_shutdown_prepare(SYSTEM_POWER_OFF); -	if (pm_power_off_prepare) -		pm_power_off_prepare(); -	migrate_to_reboot_cpu(); -	syscore_shutdown(); -	printk(KERN_EMERG "Power down.\n"); -	kmsg_dump(KMSG_DUMP_POWEROFF); -	machine_power_off(); -} -EXPORT_SYMBOL_GPL(kernel_power_off); - -static DEFINE_MUTEX(reboot_mutex); - -/* - * Reboot system call: for obvious reasons only root may call it, - * and even root needs to set up some magic numbers in the registers - * so that some mistake won't make this reboot the whole machine. - * You can also set the meaning of the ctrl-alt-del-key here. - * - * reboot doesn't sync: do that yourself before calling this. - */ -SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, -		void __user *, arg) -{ -	struct pid_namespace *pid_ns = task_active_pid_ns(current); -	char buffer[256]; -	int ret = 0; - -	/* We only trust the superuser with rebooting the system. */ -	if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT)) -		return -EPERM; - -	/* For safety, we require "magic" arguments. */ -	if (magic1 != LINUX_REBOOT_MAGIC1 || -	    (magic2 != LINUX_REBOOT_MAGIC2 && -	                magic2 != LINUX_REBOOT_MAGIC2A && -			magic2 != LINUX_REBOOT_MAGIC2B && -	                magic2 != LINUX_REBOOT_MAGIC2C)) -		return -EINVAL; - -	/* -	 * If pid namespaces are enabled and the current task is in a child -	 * pid_namespace, the command is handled by reboot_pid_ns() which will -	 * call do_exit(). -	 */ -	ret = reboot_pid_ns(pid_ns, cmd); -	if (ret) -		return ret; - -	/* Instead of trying to make the power_off code look like -	 * halt when pm_power_off is not set do it the easy way. -	 */ -	if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) -		cmd = LINUX_REBOOT_CMD_HALT; - -	mutex_lock(&reboot_mutex); -	switch (cmd) { -	case LINUX_REBOOT_CMD_RESTART: -		kernel_restart(NULL); -		break; - -	case LINUX_REBOOT_CMD_CAD_ON: -		C_A_D = 1; -		break; - -	case LINUX_REBOOT_CMD_CAD_OFF: -		C_A_D = 0; -		break; - -	case LINUX_REBOOT_CMD_HALT: -		kernel_halt(); -		do_exit(0); -		panic("cannot halt.\n"); - -	case LINUX_REBOOT_CMD_POWER_OFF: -		kernel_power_off(); -		do_exit(0); -		break; - -	case LINUX_REBOOT_CMD_RESTART2: -		if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { -			ret = -EFAULT; -			break; -		} -		buffer[sizeof(buffer) - 1] = '\0'; - -		kernel_restart(buffer); -		break; - -#ifdef CONFIG_KEXEC -	case LINUX_REBOOT_CMD_KEXEC: -		ret = kernel_kexec(); -		break; -#endif - -#ifdef CONFIG_HIBERNATION -	case LINUX_REBOOT_CMD_SW_SUSPEND: -		ret = hibernate(); -		break; -#endif - -	default: -		ret = -EINVAL; -		break; -	} -	mutex_unlock(&reboot_mutex); -	return ret; -} - -static void deferred_cad(struct work_struct *dummy) -{ -	kernel_restart(NULL); -} - -/* - * This function gets called by ctrl-alt-del - ie the keyboard interrupt. - * As it's called within an interrupt, it may NOT sync: the only choice - * is whether to reboot at once, or just ignore the ctrl-alt-del. 
- */ -void ctrl_alt_del(void) -{ -	static DECLARE_WORK(cad_work, deferred_cad); - -	if (C_A_D) -		schedule_work(&cad_work); -	else -		kill_cad_pid(SIGINT, 1); -} -	  /*   * Unprivileged users may change the real gid to the effective gid   * or vice versa.  (BSD-style) @@ -2292,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,  	return err ? -EFAULT : 0;  } -char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; - -static int __orderly_poweroff(bool force) -{ -	char **argv; -	static char *envp[] = { -		"HOME=/", -		"PATH=/sbin:/bin:/usr/sbin:/usr/bin", -		NULL -	}; -	int ret; - -	argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); -	if (argv) { -		ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); -		argv_free(argv); -	} else { -		printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", -					 __func__, poweroff_cmd); -		ret = -ENOMEM; -	} - -	if (ret && force) { -		printk(KERN_WARNING "Failed to start orderly shutdown: " -					"forcing the issue\n"); -		/* -		 * I guess this should try to kick off some daemon to sync and -		 * poweroff asap.  Or not even bother syncing if we're doing an -		 * emergency shutdown? -		 */ -		emergency_sync(); -		kernel_power_off(); -	} - -	return ret; -} - -static bool poweroff_force; - -static void poweroff_work_func(struct work_struct *work) -{ -	__orderly_poweroff(poweroff_force); -} - -static DECLARE_WORK(poweroff_work, poweroff_work_func); - -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. - */ -int orderly_poweroff(bool force) -{ -	if (force) /* do not override the pending "true" */ -		poweroff_force = true; -	schedule_work(&poweroff_work); -	return 0; -} -EXPORT_SYMBOL_GPL(orderly_poweroff); -  /**   * do_sysinfo - fill in sysinfo struct   * @info: pointer to buffer to fill diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4ce13c3cedb9..07f6fc468e17 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -599,6 +599,13 @@ static struct ctl_table kern_table[] = {  		.mode		= 0644,  		.proc_handler	= proc_dointvec,  	}, +	{ +		.procname	= "traceoff_on_warning", +		.data		= &__disable_trace_on_warning, +		.maxlen		= sizeof(__disable_trace_on_warning), +		.mode		= 0644, +		.proc_handler	= proc_dointvec, +	},  #endif  #ifdef CONFIG_MODULES  	{ @@ -800,7 +807,7 @@ static struct ctl_table kern_table[] = {  #if defined(CONFIG_LOCKUP_DETECTOR)  	{  		.procname       = "watchdog", -		.data           = &watchdog_enabled, +		.data           = &watchdog_user_enabled,  		.maxlen         = sizeof (int),  		.mode           = 0644,  		.proc_handler   = proc_dowatchdog, @@ -827,7 +834,7 @@ static struct ctl_table kern_table[] = {  	},  	{  		.procname       = "nmi_watchdog", -		.data           = &watchdog_enabled, +		.data           = &watchdog_user_enabled,  		.maxlen         = sizeof (int),  		.mode           = 0644,  		.proc_handler   = proc_dowatchdog, @@ -2339,7 +2346,11 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,  					    int write, void *data)  {  	if (write) { -		*valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); +		unsigned long jif = msecs_to_jiffies(*negp ? 
-*lvalp : *lvalp); + +		if (jif > INT_MAX) +			return 1; +		*valp = (int)jif;  	} else {  		int val = *valp;  		unsigned long lval; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index aea4a9ea6fc8..b609213ca9a2 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -3,7 +3,6 @@  #include "../fs/xfs/xfs_sysctl.h"  #include <linux/sunrpc/debug.h>  #include <linux/string.h> -#include <net/ip_vs.h>  #include <linux/syscalls.h>  #include <linux/namei.h>  #include <linux/mount.h> diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ff7d9d2ab504..9250130646f5 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -4,6 +4,8 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o  obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)		+= clockevents.o  obj-$(CONFIG_GENERIC_CLOCKEVENTS)		+= tick-common.o  obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= tick-broadcast.o +obj-$(CONFIG_GENERIC_SCHED_CLOCK)		+= sched_clock.o  obj-$(CONFIG_TICK_ONESHOT)			+= tick-oneshot.o  obj-$(CONFIG_TICK_ONESHOT)			+= tick-sched.o  obj-$(CONFIG_TIMER_STATS)			+= timer_stats.o +obj-$(CONFIG_DEBUG_FS)				+= timekeeping_debug.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f11d83b12949..eec50fcef9e4 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -199,6 +199,13 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)  } +ktime_t alarm_expires_remaining(const struct alarm *alarm) +{ +	struct alarm_base *base = &alarm_bases[alarm->type]; +	return ktime_sub(alarm->node.expires, base->gettime()); +} +EXPORT_SYMBOL_GPL(alarm_expires_remaining); +  #ifdef CONFIG_RTC_CLASS  /**   * alarmtimer_suspend - Suspend time callback @@ -303,9 +310,10 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,  	alarm->type = type;  	alarm->state = ALARMTIMER_STATE_INACTIVE;  } +EXPORT_SYMBOL_GPL(alarm_init);  /** - * alarm_start - Sets an alarm to fire + * alarm_start - Sets an absolute alarm to fire   * @alarm: ptr to alarm to set   * @start: time to run the alarm   */ @@ -323,6 +331,34 @@ int alarm_start(struct alarm *alarm, ktime_t start)  	spin_unlock_irqrestore(&base->lock, flags);  	return ret;  } +EXPORT_SYMBOL_GPL(alarm_start); + +/** + * alarm_start_relative - Sets a relative alarm to fire + * @alarm: ptr to alarm to set + * @start: time relative to now to run the alarm + */ +int alarm_start_relative(struct alarm *alarm, ktime_t start) +{ +	struct alarm_base *base = &alarm_bases[alarm->type]; + +	start = ktime_add(start, base->gettime()); +	return alarm_start(alarm, start); +} +EXPORT_SYMBOL_GPL(alarm_start_relative); + +void alarm_restart(struct alarm *alarm) +{ +	struct alarm_base *base = &alarm_bases[alarm->type]; +	unsigned long flags; + +	spin_lock_irqsave(&base->lock, flags); +	hrtimer_set_expires(&alarm->timer, alarm->node.expires); +	hrtimer_restart(&alarm->timer); +	alarmtimer_enqueue(base, alarm); +	spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(alarm_restart);  /**   * alarm_try_to_cancel - Tries to cancel an alarm timer @@ -344,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm)  	spin_unlock_irqrestore(&base->lock, flags);  	return ret;  } +EXPORT_SYMBOL_GPL(alarm_try_to_cancel);  /** @@ -361,6 +398,7 @@ int alarm_cancel(struct alarm *alarm)  		cpu_relax();  	}  } +EXPORT_SYMBOL_GPL(alarm_cancel);  u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) @@ -393,8 +431,15 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)  	alarm->node.expires = 
ktime_add(alarm->node.expires, interval);  	return overrun;  } +EXPORT_SYMBOL_GPL(alarm_forward); +u64 alarm_forward_now(struct alarm *alarm, ktime_t interval) +{ +	struct alarm_base *base = &alarm_bases[alarm->type]; +	return alarm_forward(alarm, base->gettime(), interval); +} +EXPORT_SYMBOL_GPL(alarm_forward_now);  /** diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index c6d6400ee137..38959c866789 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -15,20 +15,23 @@  #include <linux/hrtimer.h>  #include <linux/init.h>  #include <linux/module.h> -#include <linux/notifier.h>  #include <linux/smp.h> +#include <linux/device.h>  #include "tick-internal.h"  /* The registered clock event devices */  static LIST_HEAD(clockevent_devices);  static LIST_HEAD(clockevents_released); - -/* Notification for clock events */ -static RAW_NOTIFIER_HEAD(clockevents_chain); -  /* Protection for the above */  static DEFINE_RAW_SPINLOCK(clockevents_lock); +/* Protection for unbind operations */ +static DEFINE_MUTEX(clockevents_mutex); + +struct ce_unbind { +	struct clock_event_device *ce; +	int res; +};  /**   * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds @@ -232,47 +235,107 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,  	return (rc && force) ? clockevents_program_min_delta(dev) : rc;  } -/** - * clockevents_register_notifier - register a clock events change listener +/* + * Called after a notify add to make devices available which were + * released from the notifier call.   */ -int clockevents_register_notifier(struct notifier_block *nb) +static void clockevents_notify_released(void)  { -	unsigned long flags; -	int ret; +	struct clock_event_device *dev; -	raw_spin_lock_irqsave(&clockevents_lock, flags); -	ret = raw_notifier_chain_register(&clockevents_chain, nb); -	raw_spin_unlock_irqrestore(&clockevents_lock, flags); +	while (!list_empty(&clockevents_released)) { +		dev = list_entry(clockevents_released.next, +				 struct clock_event_device, list); +		list_del(&dev->list); +		list_add(&dev->list, &clockevent_devices); +		tick_check_new_device(dev); +	} +} -	return ret; +/* + * Try to install a replacement clock event device + */ +static int clockevents_replace(struct clock_event_device *ced) +{ +	struct clock_event_device *dev, *newdev = NULL; + +	list_for_each_entry(dev, &clockevent_devices, list) { +		if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) +			continue; + +		if (!tick_check_replacement(newdev, dev)) +			continue; + +		if (!try_module_get(dev->owner)) +			continue; + +		if (newdev) +			module_put(newdev->owner); +		newdev = dev; +	} +	if (newdev) { +		tick_install_replacement(newdev); +		list_del_init(&ced->list); +	} +	return newdev ? 0 : -EBUSY;  }  /* - * Notify about a clock event change. Called with clockevents_lock - * held. + * Called with clockevents_mutex and clockevents_lock held   */ -static void clockevents_do_notify(unsigned long reason, void *dev) +static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)  { -	raw_notifier_call_chain(&clockevents_chain, reason, dev); +	/* Fast track. Device is unused */ +	if (ced->mode == CLOCK_EVT_MODE_UNUSED) { +		list_del_init(&ced->list); +		return 0; +	} + +	return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY;  }  /* - * Called after a notify add to make devices available which were - * released from the notifier call. 
+ * SMP function call to unbind a device   */ -static void clockevents_notify_released(void) +static void __clockevents_unbind(void *arg)  { -	struct clock_event_device *dev; +	struct ce_unbind *cu = arg; +	int res; + +	raw_spin_lock(&clockevents_lock); +	res = __clockevents_try_unbind(cu->ce, smp_processor_id()); +	if (res == -EAGAIN) +		res = clockevents_replace(cu->ce); +	cu->res = res; +	raw_spin_unlock(&clockevents_lock); +} -	while (!list_empty(&clockevents_released)) { -		dev = list_entry(clockevents_released.next, -				 struct clock_event_device, list); -		list_del(&dev->list); -		list_add(&dev->list, &clockevent_devices); -		clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); -	} +/* + * Issues smp function call to unbind a per cpu device. Called with + * clockevents_mutex held. + */ +static int clockevents_unbind(struct clock_event_device *ced, int cpu) +{ +	struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; + +	smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); +	return cu.res;  } +/* + * Unbind a clockevents device. + */ +int clockevents_unbind_device(struct clock_event_device *ced, int cpu) +{ +	int ret; + +	mutex_lock(&clockevents_mutex); +	ret = clockevents_unbind(ced, cpu); +	mutex_unlock(&clockevents_mutex); +	return ret; +} +EXPORT_SYMBOL_GPL(clockevents_unbind); +  /**   * clockevents_register_device - register a clock event device   * @dev:	device to register @@ -290,7 +353,7 @@ void clockevents_register_device(struct clock_event_device *dev)  	raw_spin_lock_irqsave(&clockevents_lock, flags);  	list_add(&dev->list, &clockevent_devices); -	clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); +	tick_check_new_device(dev);  	clockevents_notify_released();  	raw_spin_unlock_irqrestore(&clockevents_lock, flags); @@ -386,6 +449,7 @@ void clockevents_exchange_device(struct clock_event_device *old,  	 * released list and do a notify add later.  	 */  	if (old) { +		module_put(old->owner);  		clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);  		list_del(&old->list);  		list_add(&old->list, &clockevents_released); @@ -433,10 +497,36 @@ void clockevents_notify(unsigned long reason, void *arg)  	int cpu;  	raw_spin_lock_irqsave(&clockevents_lock, flags); -	clockevents_do_notify(reason, arg);  	switch (reason) { +	case CLOCK_EVT_NOTIFY_BROADCAST_ON: +	case CLOCK_EVT_NOTIFY_BROADCAST_OFF: +	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: +		tick_broadcast_on_off(reason, arg); +		break; + +	case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: +	case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: +		tick_broadcast_oneshot_control(reason); +		break; + +	case CLOCK_EVT_NOTIFY_CPU_DYING: +		tick_handover_do_timer(arg); +		break; + +	case CLOCK_EVT_NOTIFY_SUSPEND: +		tick_suspend(); +		tick_suspend_broadcast(); +		break; + +	case CLOCK_EVT_NOTIFY_RESUME: +		tick_resume(); +		break; +  	case CLOCK_EVT_NOTIFY_CPU_DEAD: +		tick_shutdown_broadcast_oneshot(arg); +		tick_shutdown_broadcast(arg); +		tick_shutdown(arg);  		/*  		 * Unregister the clock event devices which were  		 * released from the users in the notify chain. 
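The hunks above add the unbind path for clock event devices: clockevents_unbind_device() takes clockevents_mutex, clockevents_unbind() pushes the request to the owning CPU with smp_call_function_single(), and __clockevents_unbind() either drops an unused device or lets clockevents_replace() install a substitute, reporting -EBUSY when nothing suitable exists. A minimal caller sketch follows (not part of the patch; it assumes the prototype is exposed via <linux/clockchips.h> and uses a hypothetical driver-owned device):

/* Illustrative sketch only -- not part of the patch. */
#include <linux/clockchips.h>
#include <linux/printk.h>

/* Hypothetical device, registered by the driver elsewhere. */
static struct clock_event_device example_ce;

static void example_release_clockevent(int cpu)
{
	int ret = clockevents_unbind_device(&example_ce, cpu);

	/*
	 * 0      - the device was unused, or a replacement tick device
	 *          was installed on @cpu in its place
	 * -EBUSY - it is still in use and no replacement was found
	 */
	if (ret)
		pr_warn("clockevent %s busy, not unbound: %d\n",
			example_ce.name, ret);
}
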
@@ -462,4 +552,123 @@ void clockevents_notify(unsigned long reason, void *arg)  	raw_spin_unlock_irqrestore(&clockevents_lock, flags);  }  EXPORT_SYMBOL_GPL(clockevents_notify); + +#ifdef CONFIG_SYSFS +struct bus_type clockevents_subsys = { +	.name		= "clockevents", +	.dev_name       = "clockevent", +}; + +static DEFINE_PER_CPU(struct device, tick_percpu_dev); +static struct tick_device *tick_get_tick_dev(struct device *dev); + +static ssize_t sysfs_show_current_tick_dev(struct device *dev, +					   struct device_attribute *attr, +					   char *buf) +{ +	struct tick_device *td; +	ssize_t count = 0; + +	raw_spin_lock_irq(&clockevents_lock); +	td = tick_get_tick_dev(dev); +	if (td && td->evtdev) +		count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name); +	raw_spin_unlock_irq(&clockevents_lock); +	return count; +} +static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL); + +/* We don't support the abomination of removable broadcast devices */ +static ssize_t sysfs_unbind_tick_dev(struct device *dev, +				     struct device_attribute *attr, +				     const char *buf, size_t count) +{ +	char name[CS_NAME_LEN]; +	size_t ret = sysfs_get_uname(buf, name, count); +	struct clock_event_device *ce; + +	if (ret < 0) +		return ret; + +	ret = -ENODEV; +	mutex_lock(&clockevents_mutex); +	raw_spin_lock_irq(&clockevents_lock); +	list_for_each_entry(ce, &clockevent_devices, list) { +		if (!strcmp(ce->name, name)) { +			ret = __clockevents_try_unbind(ce, dev->id); +			break; +		} +	} +	raw_spin_unlock_irq(&clockevents_lock); +	/* +	 * We hold clockevents_mutex, so ce can't go away +	 */ +	if (ret == -EAGAIN) +		ret = clockevents_unbind(ce, dev->id); +	mutex_unlock(&clockevents_mutex); +	return ret ? ret : count; +} +static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static struct device tick_bc_dev = { +	.init_name	= "broadcast", +	.id		= 0, +	.bus		= &clockevents_subsys, +}; + +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ +	return dev == &tick_bc_dev ? 
tick_get_broadcast_device() : +		&per_cpu(tick_cpu_device, dev->id); +} + +static __init int tick_broadcast_init_sysfs(void) +{ +	int err = device_register(&tick_bc_dev); + +	if (!err) +		err = device_create_file(&tick_bc_dev, &dev_attr_current_device); +	return err; +} +#else +static struct tick_device *tick_get_tick_dev(struct device *dev) +{ +	return &per_cpu(tick_cpu_device, dev->id); +} +static inline int tick_broadcast_init_sysfs(void) { return 0; }  #endif + +static int __init tick_init_sysfs(void) +{ +	int cpu; + +	for_each_possible_cpu(cpu) { +		struct device *dev = &per_cpu(tick_percpu_dev, cpu); +		int err; + +		dev->id = cpu; +		dev->bus = &clockevents_subsys; +		err = device_register(dev); +		if (!err) +			err = device_create_file(dev, &dev_attr_current_device); +		if (!err) +			err = device_create_file(dev, &dev_attr_unbind_device); +		if (err) +			return err; +	} +	return tick_broadcast_init_sysfs(); +} + +static int __init clockevents_init_sysfs(void) +{ +	int err = subsys_system_register(&clockevents_subsys, NULL); + +	if (!err) +		err = tick_init_sysfs(); +	return err; +} +device_initcall(clockevents_init_sysfs); +#endif /* SYSFS */ + +#endif /* GENERIC_CLOCK_EVENTS */ diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c9583382141a..50a8736757f3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,6 +31,8 @@  #include <linux/tick.h>  #include <linux/kthread.h> +#include "tick-internal.h" +  void timecounter_init(struct timecounter *tc,  		      const struct cyclecounter *cc,  		      u64 start_tstamp) @@ -174,11 +176,12 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)  static struct clocksource *curr_clocksource;  static LIST_HEAD(clocksource_list);  static DEFINE_MUTEX(clocksource_mutex); -static char override_name[32]; +static char override_name[CS_NAME_LEN];  static int finished_booting;  #ifdef CONFIG_CLOCKSOURCE_WATCHDOG  static void clocksource_watchdog_work(struct work_struct *work); +static void clocksource_select(void);  static LIST_HEAD(watchdog_list);  static struct clocksource *watchdog; @@ -299,13 +302,30 @@ static void clocksource_watchdog(unsigned long data)  		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&  		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&  		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { +			/* Mark it valid for high-res. */  			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + +			/* +			 * clocksource_done_booting() will sort it if +			 * finished_booting is not set yet. +			 */ +			if (!finished_booting) +				continue; +  			/* -			 * We just marked the clocksource as highres-capable, -			 * notify the rest of the system as well so that we -			 * transition into high-res mode: +			 * If this is not the current clocksource let +			 * the watchdog thread reselect it. Due to the +			 * change to high res this clocksource might +			 * be preferred now. If it is the current +			 * clocksource let the tick code know about +			 * that change.  			 */ -			tick_clock_notify(); +			if (cs != curr_clocksource) { +				cs->flags |= CLOCK_SOURCE_RESELECT; +				schedule_work(&watchdog_work); +			} else { +				tick_clock_notify(); +			}  		}  	} @@ -388,44 +408,39 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)  static void clocksource_dequeue_watchdog(struct clocksource *cs)  { -	struct clocksource *tmp;  	unsigned long flags;  	spin_lock_irqsave(&watchdog_lock, flags); -	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { -		/* cs is a watched clocksource. 
*/ -		list_del_init(&cs->wd_list); -	} else if (cs == watchdog) { -		/* Reset watchdog cycles */ -		clocksource_reset_watchdog(); -		/* Current watchdog is removed. Find an alternative. */ -		watchdog = NULL; -		list_for_each_entry(tmp, &clocksource_list, list) { -			if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY) -				continue; -			if (!watchdog || tmp->rating > watchdog->rating) -				watchdog = tmp; +	if (cs != watchdog) { +		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { +			/* cs is a watched clocksource. */ +			list_del_init(&cs->wd_list); +			/* Check if the watchdog timer needs to be stopped. */ +			clocksource_stop_watchdog();  		}  	} -	cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -	/* Check if the watchdog timer needs to be stopped. */ -	clocksource_stop_watchdog();  	spin_unlock_irqrestore(&watchdog_lock, flags);  } -static int clocksource_watchdog_kthread(void *data) +static int __clocksource_watchdog_kthread(void)  {  	struct clocksource *cs, *tmp;  	unsigned long flags;  	LIST_HEAD(unstable); +	int select = 0; -	mutex_lock(&clocksource_mutex);  	spin_lock_irqsave(&watchdog_lock, flags); -	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) +	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {  		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {  			list_del_init(&cs->wd_list);  			list_add(&cs->wd_list, &unstable); +			select = 1;  		} +		if (cs->flags & CLOCK_SOURCE_RESELECT) { +			cs->flags &= ~CLOCK_SOURCE_RESELECT; +			select = 1; +		} +	}  	/* Check if the watchdog timer needs to be stopped. */  	clocksource_stop_watchdog();  	spin_unlock_irqrestore(&watchdog_lock, flags); @@ -435,10 +450,23 @@ static int clocksource_watchdog_kthread(void *data)  		list_del_init(&cs->wd_list);  		__clocksource_change_rating(cs, 0);  	} +	return select; +} + +static int clocksource_watchdog_kthread(void *data) +{ +	mutex_lock(&clocksource_mutex); +	if (__clocksource_watchdog_kthread()) +		clocksource_select();  	mutex_unlock(&clocksource_mutex);  	return 0;  } +static bool clocksource_is_watchdog(struct clocksource *cs) +{ +	return cs == watchdog; +} +  #else /* CONFIG_CLOCKSOURCE_WATCHDOG */  static void clocksource_enqueue_watchdog(struct clocksource *cs) @@ -449,7 +477,8 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)  static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }  static inline void clocksource_resume_watchdog(void) { } -static inline int clocksource_watchdog_kthread(void *data) { return 0; } +static inline int __clocksource_watchdog_kthread(void) { return 0; } +static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }  #endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ @@ -553,24 +582,42 @@ static u64 clocksource_max_deferment(struct clocksource *cs)  #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET -/** - * clocksource_select - Select the best clocksource available - * - * Private function. Must hold clocksource_mutex when called. - * - * Select the clocksource with the best rating, or the clocksource, - * which is selected by userspace override. - */ -static void clocksource_select(void) +static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)  { -	struct clocksource *best, *cs; +	struct clocksource *cs;  	if (!finished_booting || list_empty(&clocksource_list)) +		return NULL; + +	/* +	 * We pick the clocksource with the highest rating. If oneshot +	 * mode is active, we pick the highres valid clocksource with +	 * the best rating. 
+	 */ +	list_for_each_entry(cs, &clocksource_list, list) { +		if (skipcur && cs == curr_clocksource) +			continue; +		if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) +			continue; +		return cs; +	} +	return NULL; +} + +static void __clocksource_select(bool skipcur) +{ +	bool oneshot = tick_oneshot_mode_active(); +	struct clocksource *best, *cs; + +	/* Find the best suitable clocksource */ +	best = clocksource_find_best(oneshot, skipcur); +	if (!best)  		return; -	/* First clocksource on the list has the best rating. */ -	best = list_first_entry(&clocksource_list, struct clocksource, list); +  	/* Check for the override clocksource. */  	list_for_each_entry(cs, &clocksource_list, list) { +		if (skipcur && cs == curr_clocksource) +			continue;  		if (strcmp(cs->name, override_name) != 0)  			continue;  		/* @@ -578,8 +625,7 @@ static void clocksource_select(void)  		 * capable clocksource if the tick code is in oneshot  		 * mode (highres or nohz)  		 */ -		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && -		    tick_oneshot_mode_active()) { +		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {  			/* Override clocksource cannot be used. */  			printk(KERN_WARNING "Override clocksource %s is not "  			       "HRT compatible. Cannot switch while in " @@ -590,16 +636,35 @@ static void clocksource_select(void)  			best = cs;  		break;  	} -	if (curr_clocksource != best) { -		printk(KERN_INFO "Switching to clocksource %s\n", best->name); + +	if (curr_clocksource != best && !timekeeping_notify(best)) { +		pr_info("Switched to clocksource %s\n", best->name);  		curr_clocksource = best; -		timekeeping_notify(curr_clocksource);  	}  } +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ +	return __clocksource_select(false); +} + +static void clocksource_select_fallback(void) +{ +	return __clocksource_select(true); +} +  #else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */  static inline void clocksource_select(void) { } +static inline void clocksource_select_fallback(void) { }  #endif @@ -614,16 +679,11 @@ static int __init clocksource_done_booting(void)  {  	mutex_lock(&clocksource_mutex);  	curr_clocksource = clocksource_default_clock(); -	mutex_unlock(&clocksource_mutex); -  	finished_booting = 1; -  	/*  	 * Run the watchdog first to eliminate unstable clock sources  	 */ -	clocksource_watchdog_kthread(NULL); - -	mutex_lock(&clocksource_mutex); +	__clocksource_watchdog_kthread();  	clocksource_select();  	mutex_unlock(&clocksource_mutex);  	return 0; @@ -756,7 +816,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)  	list_del(&cs->list);  	cs->rating = rating;  	clocksource_enqueue(cs); -	clocksource_select();  }  /** @@ -768,21 +827,47 @@ void clocksource_change_rating(struct clocksource *cs, int rating)  {  	mutex_lock(&clocksource_mutex);  	__clocksource_change_rating(cs, rating); +	clocksource_select();  	mutex_unlock(&clocksource_mutex);  }  EXPORT_SYMBOL(clocksource_change_rating); +/* + * Unbind clocksource @cs. Called with clocksource_mutex held + */ +static int clocksource_unbind(struct clocksource *cs) +{ +	/* +	 * I really can't convince myself to support this on hardware +	 * designed by lobotomized monkeys. 
+	 */ +	if (clocksource_is_watchdog(cs)) +		return -EBUSY; + +	if (cs == curr_clocksource) { +		/* Select and try to install a replacement clock source */ +		clocksource_select_fallback(); +		if (curr_clocksource == cs) +			return -EBUSY; +	} +	clocksource_dequeue_watchdog(cs); +	list_del_init(&cs->list); +	return 0; +} +  /**   * clocksource_unregister - remove a registered clocksource   * @cs:	clocksource to be unregistered   */ -void clocksource_unregister(struct clocksource *cs) +int clocksource_unregister(struct clocksource *cs)  { +	int ret = 0; +  	mutex_lock(&clocksource_mutex); -	clocksource_dequeue_watchdog(cs); -	list_del(&cs->list); -	clocksource_select(); +	if (!list_empty(&cs->list)) +		ret = clocksource_unbind(cs);  	mutex_unlock(&clocksource_mutex); +	return ret;  }  EXPORT_SYMBOL(clocksource_unregister); @@ -808,6 +893,23 @@ sysfs_show_current_clocksources(struct device *dev,  	return count;  } +size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) +{ +	size_t ret = cnt; + +	/* strings from sysfs write are not 0 terminated! */ +	if (!cnt || cnt >= CS_NAME_LEN) +		return -EINVAL; + +	/* strip of \n: */ +	if (buf[cnt-1] == '\n') +		cnt--; +	if (cnt > 0) +		memcpy(dst, buf, cnt); +	dst[cnt] = 0; +	return ret; +} +  /**   * sysfs_override_clocksource - interface for manually overriding clocksource   * @dev:	unused @@ -822,22 +924,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev,  					  struct device_attribute *attr,  					  const char *buf, size_t count)  { -	size_t ret = count; - -	/* strings from sysfs write are not 0 terminated! */ -	if (count >= sizeof(override_name)) -		return -EINVAL; - -	/* strip of \n: */ -	if (buf[count-1] == '\n') -		count--; +	size_t ret;  	mutex_lock(&clocksource_mutex); -	if (count > 0) -		memcpy(override_name, buf, count); -	override_name[count] = 0; -	clocksource_select(); +	ret = sysfs_get_uname(buf, override_name, count); +	if (ret >= 0) +		clocksource_select();  	mutex_unlock(&clocksource_mutex); @@ -845,6 +938,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev,  }  /** + * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource + * @dev:	unused + * @attr:	unused + * @buf:	unused + * @count:	length of buffer + * + * Takes input from sysfs interface for manually unbinding a clocksource. + */ +static ssize_t sysfs_unbind_clocksource(struct device *dev, +					struct device_attribute *attr, +					const char *buf, size_t count) +{ +	struct clocksource *cs; +	char name[CS_NAME_LEN]; +	size_t ret; + +	ret = sysfs_get_uname(buf, name, count); +	if (ret < 0) +		return ret; + +	ret = -ENODEV; +	mutex_lock(&clocksource_mutex); +	list_for_each_entry(cs, &clocksource_list, list) { +		if (strcmp(cs->name, name)) +			continue; +		ret = clocksource_unbind(cs); +		break; +	} +	mutex_unlock(&clocksource_mutex); + +	return ret ? 
ret : count; +} + +/**   * sysfs_show_available_clocksources - sysfs interface for listing clocksource   * @dev:	unused   * @attr:	unused @@ -886,6 +1013,8 @@ sysfs_show_available_clocksources(struct device *dev,  static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,  		   sysfs_override_clocksource); +static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource); +  static DEVICE_ATTR(available_clocksource, 0444,  		   sysfs_show_available_clocksources, NULL); @@ -910,6 +1039,9 @@ static int __init init_clocksource_sysfs(void)  				&device_clocksource,  				&dev_attr_current_clocksource);  	if (!error) +		error = device_create_file(&device_clocksource, +					   &dev_attr_unbind_clocksource); +	if (!error)  		error = device_create_file(  				&device_clocksource,  				&dev_attr_available_clocksource); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c new file mode 100644 index 000000000000..a326f27d7f09 --- /dev/null +++ b/kernel/time/sched_clock.c @@ -0,0 +1,212 @@ +/* + * sched_clock.c: support for extending counters to full 64-bit ns counter + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/clocksource.h> +#include <linux/init.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> +#include <linux/sched.h> +#include <linux/syscore_ops.h> +#include <linux/timer.h> +#include <linux/sched_clock.h> + +struct clock_data { +	u64 epoch_ns; +	u32 epoch_cyc; +	u32 epoch_cyc_copy; +	unsigned long rate; +	u32 mult; +	u32 shift; +	bool suspended; +}; + +static void sched_clock_poll(unsigned long wrap_ticks); +static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0); +static int irqtime = -1; + +core_param(irqtime, irqtime, int, 0400); + +static struct clock_data cd = { +	.mult	= NSEC_PER_SEC / HZ, +}; + +static u32 __read_mostly sched_clock_mask = 0xffffffff; + +static u32 notrace jiffy_sched_clock_read(void) +{ +	return (u32)(jiffies - INITIAL_JIFFIES); +} + +static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; + +static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) +{ +	return (cyc * mult) >> shift; +} + +static unsigned long long notrace sched_clock_32(void) +{ +	u64 epoch_ns; +	u32 epoch_cyc; +	u32 cyc; + +	if (cd.suspended) +		return cd.epoch_ns; + +	/* +	 * Load the epoch_cyc and epoch_ns atomically.  We do this by +	 * ensuring that we always write epoch_cyc, epoch_ns and +	 * epoch_cyc_copy in strict order, and read them in strict order. +	 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in +	 * the middle of an update, and we should repeat the load. +	 */ +	do { +		epoch_cyc = cd.epoch_cyc; +		smp_rmb(); +		epoch_ns = cd.epoch_ns; +		smp_rmb(); +	} while (epoch_cyc != cd.epoch_cyc_copy); + +	cyc = read_sched_clock(); +	cyc = (cyc - epoch_cyc) & sched_clock_mask; +	return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); +} + +/* + * Atomically update the sched_clock epoch. + */ +static void notrace update_sched_clock(void) +{ +	unsigned long flags; +	u32 cyc; +	u64 ns; + +	cyc = read_sched_clock(); +	ns = cd.epoch_ns + +		cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, +			  cd.mult, cd.shift); +	/* +	 * Write epoch_cyc and epoch_ns in a way that the update is +	 * detectable in cyc_to_fixed_sched_clock(). 
+	 */ +	raw_local_irq_save(flags); +	cd.epoch_cyc_copy = cyc; +	smp_wmb(); +	cd.epoch_ns = ns; +	smp_wmb(); +	cd.epoch_cyc = cyc; +	raw_local_irq_restore(flags); +} + +static void sched_clock_poll(unsigned long wrap_ticks) +{ +	mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks)); +	update_sched_clock(); +} + +void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) +{ +	unsigned long r, w; +	u64 res, wrap; +	char r_unit; + +	if (cd.rate > rate) +		return; + +	BUG_ON(bits > 32); +	WARN_ON(!irqs_disabled()); +	read_sched_clock = read; +	sched_clock_mask = (1 << bits) - 1; +	cd.rate = rate; + +	/* calculate the mult/shift to convert counter ticks to ns. */ +	clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); + +	r = rate; +	if (r >= 4000000) { +		r /= 1000000; +		r_unit = 'M'; +	} else if (r >= 1000) { +		r /= 1000; +		r_unit = 'k'; +	} else +		r_unit = ' '; + +	/* calculate how many ns until we wrap */ +	wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); +	do_div(wrap, NSEC_PER_MSEC); +	w = wrap; + +	/* calculate the ns resolution of this counter */ +	res = cyc_to_ns(1ULL, cd.mult, cd.shift); +	pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", +		bits, r, r_unit, res, w); + +	/* +	 * Start the timer to keep sched_clock() properly updated and +	 * sets the initial epoch. +	 */ +	sched_clock_timer.data = msecs_to_jiffies(w - (w / 10)); +	update_sched_clock(); + +	/* +	 * Ensure that sched_clock() starts off at 0ns +	 */ +	cd.epoch_ns = 0; + +	/* Enable IRQ time accounting if we have a fast enough sched_clock */ +	if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) +		enable_sched_clock_irqtime(); + +	pr_debug("Registered %pF as sched_clock source\n", read); +} + +unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; + +unsigned long long notrace sched_clock(void) +{ +	return sched_clock_func(); +} + +void __init sched_clock_postinit(void) +{ +	/* +	 * If no sched_clock function has been provided at that point, +	 * make it the final one one. 
+	 */ +	if (read_sched_clock == jiffy_sched_clock_read) +		setup_sched_clock(jiffy_sched_clock_read, 32, HZ); + +	sched_clock_poll(sched_clock_timer.data); +} + +static int sched_clock_suspend(void) +{ +	sched_clock_poll(sched_clock_timer.data); +	cd.suspended = true; +	return 0; +} + +static void sched_clock_resume(void) +{ +	cd.epoch_cyc = read_sched_clock(); +	cd.epoch_cyc_copy = cd.epoch_cyc; +	cd.suspended = false; +} + +static struct syscore_ops sched_clock_ops = { +	.suspend = sched_clock_suspend, +	.resume = sched_clock_resume, +}; + +static int __init sched_clock_syscore_init(void) +{ +	register_syscore_ops(&sched_clock_ops); +	return 0; +} +device_initcall(sched_clock_syscore_init); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 20d6fba70652..218bcb565fed 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -19,6 +19,7 @@  #include <linux/profile.h>  #include <linux/sched.h>  #include <linux/smp.h> +#include <linux/module.h>  #include "tick-internal.h" @@ -29,6 +30,7 @@  static struct tick_device tick_broadcast_device;  static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tick_broadcast_on;  static cpumask_var_t tmpmask;  static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);  static int tick_broadcast_force; @@ -64,17 +66,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)  /*   * Check, if the device can be utilized as broadcast device:   */ -int tick_check_broadcast_device(struct clock_event_device *dev) +static bool tick_check_broadcast_device(struct clock_event_device *curdev, +					struct clock_event_device *newdev) +{ +	if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || +	    (newdev->features & CLOCK_EVT_FEAT_C3STOP)) +		return false; + +	if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT && +	    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) +		return false; + +	return !curdev || newdev->rating > curdev->rating; +} + +/* + * Conditionally install/replace broadcast device + */ +void tick_install_broadcast_device(struct clock_event_device *dev)  {  	struct clock_event_device *cur = tick_broadcast_device.evtdev; -	if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || -	    (tick_broadcast_device.evtdev && -	     tick_broadcast_device.evtdev->rating >= dev->rating) || -	     (dev->features & CLOCK_EVT_FEAT_C3STOP)) -		return 0; +	if (!tick_check_broadcast_device(cur, dev)) +		return; + +	if (!try_module_get(dev->owner)) +		return; -	clockevents_exchange_device(tick_broadcast_device.evtdev, dev); +	clockevents_exchange_device(cur, dev);  	if (cur)  		cur->event_handler = clockevents_handle_noop;  	tick_broadcast_device.evtdev = dev; @@ -90,7 +109,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev)  	 */  	if (dev->features & CLOCK_EVT_FEAT_ONESHOT)  		tick_clock_notify(); -	return 1;  }  /* @@ -123,8 +141,9 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev)   */  int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)  { +	struct clock_event_device *bc = tick_broadcast_device.evtdev;  	unsigned long flags; -	int ret = 0; +	int ret;  	raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -138,20 +157,62 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)  		dev->event_handler = tick_handle_periodic;  		tick_device_setup_broadcast_func(dev);  		cpumask_set_cpu(cpu, tick_broadcast_mask); -		tick_broadcast_start_periodic(tick_broadcast_device.evtdev); +		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) +	
		tick_broadcast_start_periodic(bc); +		else +			tick_broadcast_setup_oneshot(bc);  		ret = 1;  	} else {  		/* -		 * When the new device is not affected by the stop -		 * feature and the cpu is marked in the broadcast mask -		 * then clear the broadcast bit. +		 * Clear the broadcast bit for this cpu if the +		 * device is not power state affected.  		 */ -		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { -			int cpu = smp_processor_id(); +		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))  			cpumask_clear_cpu(cpu, tick_broadcast_mask); -			tick_broadcast_clear_oneshot(cpu); -		} else { +		else  			tick_device_setup_broadcast_func(dev); + +		/* +		 * Clear the broadcast bit if the CPU is not in +		 * periodic broadcast on state. +		 */ +		if (!cpumask_test_cpu(cpu, tick_broadcast_on)) +			cpumask_clear_cpu(cpu, tick_broadcast_mask); + +		switch (tick_broadcast_device.mode) { +		case TICKDEV_MODE_ONESHOT: +			/* +			 * If the system is in oneshot mode we can +			 * unconditionally clear the oneshot mask bit, +			 * because the CPU is running and therefore +			 * not in an idle state which causes the power +			 * state affected device to stop. Let the +			 * caller initialize the device. +			 */ +			tick_broadcast_clear_oneshot(cpu); +			ret = 0; +			break; + +		case TICKDEV_MODE_PERIODIC: +			/* +			 * If the system is in periodic mode, check +			 * whether the broadcast device can be +			 * switched off now. +			 */ +			if (cpumask_empty(tick_broadcast_mask) && bc) +				clockevents_shutdown(bc); +			/* +			 * If we kept the cpu in the broadcast mask, +			 * tell the caller to leave the per cpu device +			 * in shutdown state. The periodic interrupt +			 * is delivered by the broadcast device. +			 */ +			ret = cpumask_test_cpu(cpu, tick_broadcast_mask); +			break; +		default: +			/* Nothing to do */ +			ret = 0; +			break;  		}  	}  	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -281,6 +342,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)  	switch (*reason) {  	case CLOCK_EVT_NOTIFY_BROADCAST_ON:  	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: +		cpumask_set_cpu(cpu, tick_broadcast_on);  		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {  			if (tick_broadcast_device.mode ==  			    TICKDEV_MODE_PERIODIC) @@ -290,8 +352,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)  			tick_broadcast_force = 1;  		break;  	case CLOCK_EVT_NOTIFY_BROADCAST_OFF: -		if (!tick_broadcast_force && -		    cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { +		if (tick_broadcast_force) +			break; +		cpumask_clear_cpu(cpu, tick_broadcast_on); +		if (!tick_device_is_functional(dev)) +			break; +		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {  			if (tick_broadcast_device.mode ==  			    TICKDEV_MODE_PERIODIC)  				tick_setup_periodic(dev, 0); @@ -349,6 +415,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)  	bc = tick_broadcast_device.evtdev;  	cpumask_clear_cpu(cpu, tick_broadcast_mask); +	cpumask_clear_cpu(cpu, tick_broadcast_on);  	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {  		if (bc && cpumask_empty(tick_broadcast_mask)) @@ -475,7 +542,15 @@ void tick_check_oneshot_broadcast(int cpu)  	if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {  		struct tick_device *td = &per_cpu(tick_cpu_device, cpu); -		clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); +		/* +		 * We might be in the middle of switching over from +		 * periodic to oneshot. If the CPU has not yet +		 * switched over, leave the device alone. 
+		 */ +		if (td->mode == TICKDEV_MODE_ONESHOT) { +			clockevents_set_mode(td->evtdev, +					     CLOCK_EVT_MODE_ONESHOT); +		}  	}  } @@ -522,6 +597,13 @@ again:  	cpumask_clear(tick_broadcast_force_mask);  	/* +	 * Sanity check. Catch the case where we try to broadcast to +	 * offline cpus. +	 */ +	if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) +		cpumask_and(tmpmask, tmpmask, cpu_online_mask); + +	/*  	 * Wakeup the cpus which have an expired event.  	 */  	tick_do_broadcast(tmpmask); @@ -761,10 +843,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)  	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);  	/* -	 * Clear the broadcast mask flag for the dead cpu, but do not -	 * stop the broadcast device! +	 * Clear the broadcast masks for the dead cpu, but do not stop +	 * the broadcast device!  	 */  	cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); +	cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); +	cpumask_clear_cpu(cpu, tick_broadcast_force_mask);  	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);  } @@ -792,6 +876,7 @@ bool tick_broadcast_oneshot_available(void)  void __init tick_broadcast_init(void)  {  	zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); +	zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);  	zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);  #ifdef CONFIG_TICK_ONESHOT  	zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5d3fb100bc06..64522ecdfe0e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,6 +18,7 @@  #include <linux/percpu.h>  #include <linux/profile.h>  #include <linux/sched.h> +#include <linux/module.h>  #include <asm/irq_regs.h> @@ -33,7 +34,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);  ktime_t tick_next_period;  ktime_t tick_period;  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; -static DEFINE_RAW_SPINLOCK(tick_device_lock);  /*   * Debugging: see timer_list.c @@ -194,7 +194,8 @@ static void tick_setup_device(struct tick_device *td,  	 * When global broadcasting is active, check if the current  	 * device is registered as a placeholder for broadcast mode.  	 * This allows us to handle this x86 misfeature in a generic -	 * way. +	 * way. This function also returns !=0 when we keep the +	 * current active broadcast state for this CPU.  	 
*/  	if (tick_device_uses_broadcast(newdev, cpu))  		return; @@ -205,17 +206,75 @@ static void tick_setup_device(struct tick_device *td,  		tick_setup_oneshot(newdev, handler, next_event);  } +void tick_install_replacement(struct clock_event_device *newdev) +{ +	struct tick_device *td = &__get_cpu_var(tick_cpu_device); +	int cpu = smp_processor_id(); + +	clockevents_exchange_device(td->evtdev, newdev); +	tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); +	if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) +		tick_oneshot_notify(); +} + +static bool tick_check_percpu(struct clock_event_device *curdev, +			      struct clock_event_device *newdev, int cpu) +{ +	if (!cpumask_test_cpu(cpu, newdev->cpumask)) +		return false; +	if (cpumask_equal(newdev->cpumask, cpumask_of(cpu))) +		return true; +	/* Check if irq affinity can be set */ +	if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq)) +		return false; +	/* Prefer an existing cpu local device */ +	if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) +		return false; +	return true; +} + +static bool tick_check_preferred(struct clock_event_device *curdev, +				 struct clock_event_device *newdev) +{ +	/* Prefer oneshot capable device */ +	if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) { +		if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT)) +			return false; +		if (tick_oneshot_mode_active()) +			return false; +	} + +	/* +	 * Use the higher rated one, but prefer a CPU local device with a lower +	 * rating than a non-CPU local device +	 */ +	return !curdev || +		newdev->rating > curdev->rating || +	       !cpumask_equal(curdev->cpumask, newdev->cpumask); +} + +/* + * Check whether the new device is a better fit than curdev. curdev + * can be NULL ! + */ +bool tick_check_replacement(struct clock_event_device *curdev, +			    struct clock_event_device *newdev) +{ +	if (tick_check_percpu(curdev, newdev, smp_processor_id())) +		return false; + +	return tick_check_preferred(curdev, newdev); +} +  /* - * Check, if the new registered device should be used. + * Check, if the new registered device should be used. Called with + * clockevents_lock held and interrupts disabled.   */ -static int tick_check_new_device(struct clock_event_device *newdev) +void tick_check_new_device(struct clock_event_device *newdev)  {  	struct clock_event_device *curdev;  	struct tick_device *td; -	int cpu, ret = NOTIFY_OK; -	unsigned long flags; - -	raw_spin_lock_irqsave(&tick_device_lock, flags); +	int cpu;  	cpu = smp_processor_id();  	if (!cpumask_test_cpu(cpu, newdev->cpumask)) @@ -225,40 +284,15 @@ static int tick_check_new_device(struct clock_event_device *newdev)  	curdev = td->evtdev;  	/* cpu local device ? */ -	if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { - -		/* -		 * If the cpu affinity of the device interrupt can not -		 * be set, ignore it. -		 */ -		if (!irq_can_set_affinity(newdev->irq)) -			goto out_bc; +	if (!tick_check_percpu(curdev, newdev, cpu)) +		goto out_bc; -		/* -		 * If we have a cpu local device already, do not replace it -		 * by a non cpu local device -		 */ -		if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) -			goto out_bc; -	} +	/* Preference decision */ +	if (!tick_check_preferred(curdev, newdev)) +		goto out_bc; -	/* -	 * If we have an active device, then check the rating and the oneshot -	 * feature. -	 */ -	if (curdev) { -		/* -		 * Prefer one shot capable devices ! 
-		 */ -		if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && -		    !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) -			goto out_bc; -		/* -		 * Check the rating -		 */ -		if (curdev->rating >= newdev->rating) -			goto out_bc; -	} +	if (!try_module_get(newdev->owner)) +		return;  	/*  	 * Replace the eventually existing device by the new @@ -273,20 +307,13 @@ static int tick_check_new_device(struct clock_event_device *newdev)  	tick_setup_device(td, newdev, cpu, cpumask_of(cpu));  	if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)  		tick_oneshot_notify(); - -	raw_spin_unlock_irqrestore(&tick_device_lock, flags); -	return NOTIFY_STOP; +	return;  out_bc:  	/*  	 * Can the new device be used as a broadcast device ?  	 */ -	if (tick_check_broadcast_device(newdev)) -		ret = NOTIFY_STOP; - -	raw_spin_unlock_irqrestore(&tick_device_lock, flags); - -	return ret; +	tick_install_broadcast_device(newdev);  }  /* @@ -294,7 +321,7 @@ out_bc:   *   * Called with interrupts disabled.   */ -static void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(int *cpup)  {  	if (*cpup == tick_do_timer_cpu) {  		int cpu = cpumask_first(cpu_online_mask); @@ -311,13 +338,11 @@ static void tick_handover_do_timer(int *cpup)   * access the hardware device itself.   * We just set the mode and remove it from the lists.   */ -static void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int *cpup)  {  	struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);  	struct clock_event_device *dev = td->evtdev; -	unsigned long flags; -	raw_spin_lock_irqsave(&tick_device_lock, flags);  	td->mode = TICKDEV_MODE_PERIODIC;  	if (dev) {  		/* @@ -329,26 +354,20 @@ static void tick_shutdown(unsigned int *cpup)  		dev->event_handler = clockevents_handle_noop;  		td->evtdev = NULL;  	} -	raw_spin_unlock_irqrestore(&tick_device_lock, flags);  } -static void tick_suspend(void) +void tick_suspend(void)  {  	struct tick_device *td = &__get_cpu_var(tick_cpu_device); -	unsigned long flags; -	raw_spin_lock_irqsave(&tick_device_lock, flags);  	clockevents_shutdown(td->evtdev); -	raw_spin_unlock_irqrestore(&tick_device_lock, flags);  } -static void tick_resume(void) +void tick_resume(void)  {  	struct tick_device *td = &__get_cpu_var(tick_cpu_device); -	unsigned long flags;  	int broadcast = tick_resume_broadcast(); -	raw_spin_lock_irqsave(&tick_device_lock, flags);  	clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);  	if (!broadcast) { @@ -357,68 +376,12 @@ static void tick_resume(void)  		else  			tick_resume_oneshot();  	} -	raw_spin_unlock_irqrestore(&tick_device_lock, flags);  } -/* - * Notification about clock event devices - */ -static int tick_notify(struct notifier_block *nb, unsigned long reason, -			       void *dev) -{ -	switch (reason) { - -	case CLOCK_EVT_NOTIFY_ADD: -		return tick_check_new_device(dev); - -	case CLOCK_EVT_NOTIFY_BROADCAST_ON: -	case CLOCK_EVT_NOTIFY_BROADCAST_OFF: -	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: -		tick_broadcast_on_off(reason, dev); -		break; - -	case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: -	case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: -		tick_broadcast_oneshot_control(reason); -		break; - -	case CLOCK_EVT_NOTIFY_CPU_DYING: -		tick_handover_do_timer(dev); -		break; - -	case CLOCK_EVT_NOTIFY_CPU_DEAD: -		tick_shutdown_broadcast_oneshot(dev); -		tick_shutdown_broadcast(dev); -		tick_shutdown(dev); -		break; - -	case CLOCK_EVT_NOTIFY_SUSPEND: -		tick_suspend(); -		tick_suspend_broadcast(); -		break; - -	case CLOCK_EVT_NOTIFY_RESUME: -		tick_resume(); -		break; - -	default: -		break; -	} - 
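The rewrite above pulls the open-coded selection logic out of tick_check_new_device() into tick_check_percpu() and tick_check_preferred(). The sketch below models only the preference half (oneshot capability first, then rating, with a nudge towards CPU-local devices); struct evtdev and the example devices and ratings are invented for illustration, and the real code additionally compares cpumasks and IRQ affinity:

#include <stdbool.h>
#include <stdio.h>

/* simplified stand-in for struct clock_event_device */
struct evtdev {
	const char *name;
	int rating;
	bool oneshot;	/* CLOCK_EVT_FEAT_ONESHOT */
	bool percpu;	/* device is CPU-local */
};

/* same shape as tick_check_preferred(): never trade a oneshot-capable
 * device for a periodic-only one, otherwise prefer the higher rating,
 * and let a CPU-local device win against a global one */
static bool prefer_new(const struct evtdev *cur, const struct evtdev *new)
{
	if (!new->oneshot && cur && cur->oneshot)
		return false;
	return !cur || new->rating > cur->rating ||
	       (new->percpu && !cur->percpu);
}

int main(void)
{
	struct evtdev pit   = { "pit",    30, false, false };
	struct evtdev hpet  = { "hpet",   50, true,  false };
	struct evtdev lapic = { "lapic", 100, true,  true  };

	printf("pit   -> hpet : %d\n", prefer_new(&pit, &hpet));	/* 1 */
	printf("hpet  -> lapic: %d\n", prefer_new(&hpet, &lapic));	/* 1 */
	printf("lapic -> pit  : %d\n", prefer_new(&lapic, &pit));	/* 0 */
	return 0;
}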
-	return NOTIFY_OK; -} - -static struct notifier_block tick_notifier = { -	.notifier_call = tick_notify, -}; -  /**   * tick_init - initialize the tick control - * - * Register the notifier with the clockevents framework   */  void __init tick_init(void)  { -	clockevents_register_notifier(&tick_notifier);  	tick_broadcast_init();  } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f0299eae4602..bc906cad709b 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,6 +6,8 @@  extern seqlock_t jiffies_lock; +#define CS_NAME_LEN	32 +  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD  #define TICK_DO_TIMER_NONE	-1 @@ -18,9 +20,19 @@ extern int tick_do_timer_cpu __read_mostly;  extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);  extern void tick_handle_periodic(struct clock_event_device *dev); +extern void tick_check_new_device(struct clock_event_device *dev); +extern void tick_handover_do_timer(int *cpup); +extern void tick_shutdown(unsigned int *cpup); +extern void tick_suspend(void); +extern void tick_resume(void); +extern bool tick_check_replacement(struct clock_event_device *curdev, +				   struct clock_event_device *newdev); +extern void tick_install_replacement(struct clock_event_device *dev);  extern void clockevents_shutdown(struct clock_event_device *dev); +extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); +  /*   * NO_HZ / high resolution timer shared code   */ @@ -90,7 +102,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; }   */  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST  extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); -extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern void tick_install_broadcast_device(struct clock_event_device *dev);  extern int tick_is_broadcast_device(struct clock_event_device *dev);  extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);  extern void tick_shutdown_broadcast(unsigned int *cpup); @@ -102,9 +114,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);  #else /* !BROADCAST */ -static inline int tick_check_broadcast_device(struct clock_event_device *dev) +static inline void tick_install_broadcast_device(struct clock_event_device *dev)  { -	return 0;  }  static inline int tick_is_broadcast_device(struct clock_event_device *dev) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0cf1c1453181..e77edc97e036 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -178,6 +178,11 @@ static bool can_stop_full_tick(void)  	 */  	if (!sched_clock_stable) {  		trace_tick_stop(0, "unstable sched clock\n"); +		/* +		 * Don't allow the user to think they can get +		 * full NO_HZ with this machine. 
+		 */ +		WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock");  		return false;  	}  #endif @@ -293,7 +298,7 @@ static int __init tick_nohz_full_setup(char *str)  }  __setup("nohz_full=", tick_nohz_full_setup); -static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, +static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,  						 unsigned long action,  						 void *hcpu)  { @@ -346,16 +351,6 @@ void __init tick_nohz_init(void)  	}  	cpu_notifier(tick_nohz_cpu_down_callback, 0); - -	/* Make sure full dynticks CPU are also RCU nocbs */ -	for_each_cpu(cpu, nohz_full_mask) { -		if (!rcu_is_nocb_cpu(cpu)) { -			pr_warning("NO_HZ: CPU %d is not RCU nocb: " -				   "cleared from nohz_full range", cpu); -			cpumask_clear_cpu(cpu, nohz_full_mask); -		} -	} -  	cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);  	pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);  } @@ -832,13 +827,10 @@ void tick_nohz_irq_exit(void)  {  	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); -	if (ts->inidle) { -		/* Cancel the timer because CPU already waken up from the C-states*/ -		menu_hrtimer_cancel(); +	if (ts->inidle)  		__tick_nohz_idle_enter(ts); -	} else { +	else  		tick_nohz_full_stop_tick(ts); -	}  }  /** @@ -936,8 +928,6 @@ void tick_nohz_idle_exit(void)  	ts->inidle = 0; -	/* Cancel the timer because CPU already waken up from the C-states*/ -	menu_hrtimer_cancel();  	if (ts->idle_active || ts->tick_stopped)  		now = ktime_get(); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index baeeb5c87cf1..48b9fffabdc2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,6 +25,11 @@  #include "tick-internal.h"  #include "ntp_internal.h" +#include "timekeeping_internal.h" + +#define TK_CLEAR_NTP		(1 << 0) +#define TK_MIRROR		(1 << 1) +#define TK_CLOCK_WAS_SET	(1 << 2)  static struct timekeeper timekeeper;  static DEFINE_RAW_SPINLOCK(timekeeper_lock); @@ -200,9 +205,9 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)  static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); -static void update_pvclock_gtod(struct timekeeper *tk) +static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)  { -	raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); +	raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);  }  /** @@ -216,7 +221,7 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb)  	raw_spin_lock_irqsave(&timekeeper_lock, flags);  	ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); -	update_pvclock_gtod(tk); +	update_pvclock_gtod(tk, true);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	return ret; @@ -241,16 +246,16 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)  EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);  /* must hold timekeeper_lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) +static void timekeeping_update(struct timekeeper *tk, unsigned int action)  { -	if (clearntp) { +	if (action & TK_CLEAR_NTP) {  		tk->ntp_error = 0;  		ntp_clear();  	}  	update_vsyscall(tk); -	update_pvclock_gtod(tk); +	update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); -	if (mirror) +	if (action & TK_MIRROR)  		memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));  } @@ -508,7 +513,7 @@ int do_settimeofday(const struct timespec *tv)  	tk_set_xtime(tk, tv); -	timekeeping_update(tk, true, true); +	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);  	
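The two boolean parameters of timekeeping_update() are replaced above by a TK_* action bitmask, which lets callers such as do_settimeofday() state exactly which side effects they need. A toy userspace version of the same calling convention (tk_update_demo() is a stand-in, not the kernel function):

#include <stdio.h>

#define TK_CLEAR_NTP		(1 << 0)
#define TK_MIRROR		(1 << 1)
#define TK_CLOCK_WAS_SET	(1 << 2)

/* toy stand-in for timekeeping_update(): act on whichever bits are set */
static void tk_update_demo(unsigned int action)
{
	if (action & TK_CLEAR_NTP)
		printf("  clear NTP error state\n");
	printf("  update vsyscall/pvclock, clock_was_set=%d\n",
	       !!(action & TK_CLOCK_WAS_SET));
	if (action & TK_MIRROR)
		printf("  mirror into the shadow timekeeper\n");
}

int main(void)
{
	puts("settimeofday-style caller:");
	tk_update_demo(TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
	puts("resume-style caller:");
	tk_update_demo(TK_MIRROR | TK_CLOCK_WAS_SET);
	return 0;
}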
write_seqcount_end(&timekeeper_seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -552,7 +557,7 @@ int timekeeping_inject_offset(struct timespec *ts)  	tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));  error: /* even if we error out, we forwarded the time, so call update */ -	timekeeping_update(tk, true, true); +	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);  	write_seqcount_end(&timekeeper_seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -627,13 +632,22 @@ static int change_clocksource(void *data)  	write_seqcount_begin(&timekeeper_seq);  	timekeeping_forward_now(tk); -	if (!new->enable || new->enable(new) == 0) { -		old = tk->clock; -		tk_setup_internals(tk, new); -		if (old->disable) -			old->disable(old); +	/* +	 * If the cs is in module, get a module reference. Succeeds +	 * for built-in code (owner == NULL) as well. +	 */ +	if (try_module_get(new->owner)) { +		if (!new->enable || new->enable(new) == 0) { +			old = tk->clock; +			tk_setup_internals(tk, new); +			if (old->disable) +				old->disable(old); +			module_put(old->owner); +		} else { +			module_put(new->owner); +		}  	} -	timekeeping_update(tk, true, true); +	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);  	write_seqcount_end(&timekeeper_seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -648,14 +662,15 @@ static int change_clocksource(void *data)   * This function is called from clocksource.c after a new, better clock   * source has been registered. The caller holds the clocksource_mutex.   */ -void timekeeping_notify(struct clocksource *clock) +int timekeeping_notify(struct clocksource *clock)  {  	struct timekeeper *tk = &timekeeper;  	if (tk->clock == clock) -		return; +		return 0;  	stop_machine(change_clocksource, clock, NULL);  	tick_clock_notify(); +	return tk->clock == clock ? 0 : -1;  }  /** @@ -841,6 +856,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,  	tk_xtime_add(tk, delta);  	tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));  	tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); +	tk_debug_account_sleep_time(delta);  }  /** @@ -872,7 +888,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)  	__timekeeping_inject_sleeptime(tk, delta); -	timekeeping_update(tk, true, true); +	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);  	write_seqcount_end(&timekeeper_seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -954,7 +970,7 @@ static void timekeeping_resume(void)  	tk->cycle_last = clock->cycle_last = cycle_now;  	tk->ntp_error = 0;  	timekeeping_suspended = 0; -	timekeeping_update(tk, false, true); +	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);  	write_seqcount_end(&timekeeper_seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1236,9 +1252,10 @@ out_adjust:   * It also calls into the NTP code to handle leapsecond processing.   
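change_clocksource() now pins the candidate clocksource's module before enabling it and drops the old module reference only once the switch has succeeded; on an enable failure the new reference is dropped and the old clocksource stays in place. A simplified model of that ordering (struct cs, cs_try_get() and cs_put() are toy names, and unlike try_module_get() the toy get never fails):

#include <stdbool.h>
#include <stdio.h>

/* toy clocksource with a module-style reference count */
struct cs {
	const char *name;
	int refs;
	bool enable_fails;
};

static bool cs_try_get(struct cs *c) { c->refs++; return true; }
static void cs_put(struct cs *c)     { c->refs--; }

/* pin the new source, try to enable it, and only then drop the old one;
 * on failure drop the reference just taken and keep the old source */
static struct cs *switch_cs(struct cs *old, struct cs *new)
{
	if (!cs_try_get(new))
		return old;
	if (new->enable_fails) {
		cs_put(new);
		return old;
	}
	cs_put(old);
	return new;
}

int main(void)
{
	struct cs jiffies_cs = { "jiffies", 1, false };
	struct cs tsc        = { "tsc",     0, false };
	struct cs broken     = { "broken",  0, true  };
	struct cs *cur = &jiffies_cs;

	cur = switch_cs(cur, &tsc);	/* succeeds */
	cur = switch_cs(cur, &broken);	/* enable fails, tsc stays */
	printf("current=%s jiffies.refs=%d tsc.refs=%d broken.refs=%d\n",
	       cur->name, jiffies_cs.refs, tsc.refs, broken.refs);
	return 0;
}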
*   */ -static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) +static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)  {  	u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; +	unsigned int action = 0;  	while (tk->xtime_nsec >= nsecps) {  		int leap; @@ -1261,8 +1278,10 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)  			__timekeeping_set_tai_offset(tk, tk->tai_offset - leap);  			clock_was_set_delayed(); +			action = TK_CLOCK_WAS_SET;  		}  	} +	return action;  }  /** @@ -1347,6 +1366,7 @@ static void update_wall_time(void)  	struct timekeeper *tk = &shadow_timekeeper;  	cycle_t offset;  	int shift = 0, maxshift; +	unsigned int action;  	unsigned long flags;  	raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1399,7 +1419,7 @@ static void update_wall_time(void)  	 * Finally, make sure that after the rounding  	 * xtime_nsec isn't larger than NSEC_PER_SEC  	 */ -	accumulate_nsecs_to_secs(tk); +	action = accumulate_nsecs_to_secs(tk);  	write_seqcount_begin(&timekeeper_seq);  	/* Update clock->cycle_last with the new value */ @@ -1415,7 +1435,7 @@ static void update_wall_time(void)  	 * updating.  	 */  	memcpy(real_tk, tk, sizeof(*tk)); -	timekeeping_update(real_tk, false, false); +	timekeeping_update(real_tk, action);  	write_seqcount_end(&timekeeper_seq);  out:  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1677,6 +1697,7 @@ int do_adjtimex(struct timex *txc)  	if (tai != orig_tai) {  		__timekeeping_set_tai_offset(tk, tai); +		update_pvclock_gtod(tk, true);  		clock_was_set_delayed();  	}  	write_seqcount_end(&timekeeper_seq); diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c new file mode 100644 index 000000000000..802433a4f5eb --- /dev/null +++ b/kernel/time/timekeeping_debug.c @@ -0,0 +1,72 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + */ + +#include <linux/debugfs.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/time.h> + +static unsigned int sleep_time_bin[32] = {0}; + +static int tk_debug_show_sleep_time(struct seq_file *s, void *data) +{ +	unsigned int bin; +	seq_puts(s, "      time (secs)        count\n"); +	seq_puts(s, "------------------------------\n"); +	for (bin = 0; bin < 32; bin++) { +		if (sleep_time_bin[bin] == 0) +			continue; +		seq_printf(s, "%10u - %-10u %4u\n", +			bin ? 
1 << (bin - 1) : 0, 1 << bin, +				sleep_time_bin[bin]); +	} +	return 0; +} + +static int tk_debug_sleep_time_open(struct inode *inode, struct file *file) +{ +	return single_open(file, tk_debug_show_sleep_time, NULL); +} + +static const struct file_operations tk_debug_sleep_time_fops = { +	.open		= tk_debug_sleep_time_open, +	.read		= seq_read, +	.llseek		= seq_lseek, +	.release	= single_release, +}; + +static int __init tk_debug_sleep_time_init(void) +{ +	struct dentry *d; + +	d = debugfs_create_file("sleep_time", 0444, NULL, NULL, +		&tk_debug_sleep_time_fops); +	if (!d) { +		pr_err("Failed to create sleep_time debug file\n"); +		return -ENOMEM; +	} + +	return 0; +} +late_initcall(tk_debug_sleep_time_init); + +void tk_debug_account_sleep_time(struct timespec *t) +{ +	sleep_time_bin[fls(t->tv_sec)]++; +} + diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h new file mode 100644 index 000000000000..13323ea08ffa --- /dev/null +++ b/kernel/time/timekeeping_internal.h @@ -0,0 +1,14 @@ +#ifndef _TIMEKEEPING_INTERNAL_H +#define _TIMEKEEPING_INTERNAL_H +/* + * timekeeping debug functions + */ +#include <linux/time.h> + +#ifdef CONFIG_DEBUG_FS +extern void tk_debug_account_sleep_time(struct timespec *t); +#else +#define tk_debug_account_sleep_time(x) +#endif + +#endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/timer.c b/kernel/timer.c index 15ffdb3f1948..4296d13db3d1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -149,9 +149,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,  	/* now that we have rounded, subtract the extra skew again */  	j -= cpu * 3; -	if (j <= jiffies) /* rounding ate our timeout entirely; */ -		return original; -	return j; +	/* +	 * Make sure j is still in the future. Otherwise return the +	 * unmodified value. +	 */ +	return time_is_after_jiffies(j) ? j : original;  }  /** @@ -1503,11 +1505,11 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)  }  EXPORT_SYMBOL(schedule_timeout_uninterruptible); -static int __cpuinit init_timers_cpu(int cpu) +static int init_timers_cpu(int cpu)  {  	int j;  	struct tvec_base *base; -	static char __cpuinitdata tvec_base_done[NR_CPUS]; +	static char tvec_base_done[NR_CPUS];  	if (!tvec_base_done[cpu]) {  		static char boot_done; @@ -1575,7 +1577,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea  	}  } -static void __cpuinit migrate_timers(int cpu) +static void migrate_timers(int cpu)  {  	struct tvec_base *old_base;  	struct tvec_base *new_base; @@ -1608,7 +1610,7 @@ static void __cpuinit migrate_timers(int cpu)  }  #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit timer_cpu_notify(struct notifier_block *self, +static int timer_cpu_notify(struct notifier_block *self,  				unsigned long action, void *hcpu)  {  	long cpu = (long)hcpu; @@ -1633,7 +1635,7 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,  	return NOTIFY_OK;  } -static struct notifier_block __cpuinitdata timers_nb = { +static struct notifier_block timers_nb = {  	.notifier_call	= timer_cpu_notify,  }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6c508ff33c62..a6d098c6df3f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops)  	return 0;  } +static void ftrace_sync(struct work_struct *work) +{ +	/* +	 * This function is just a stub to implement a hard force +	 * of synchronize_sched(). 
This requires synchronizing +	 * tasks even in userspace and idle. +	 * +	 * Yes, function tracing is rude. +	 */ +} +  static int __unregister_ftrace_function(struct ftrace_ops *ops)  {  	int ret; @@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)  			 * so there'll be no new users. We must ensure  			 * all current users are done before we free  			 * the control data. +			 * Note synchronize_sched() is not enough, as we +			 * use preempt_disable() to do RCU, but the function +			 * tracer can be called where RCU is not active +			 * (before user_exit()).  			 */ -			synchronize_sched(); +			schedule_on_each_cpu(ftrace_sync);  			control_ops_free(ops);  		}  	} else @@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)  	/*  	 * Dynamic ops may be freed, we must make sure that all  	 * callers are done before leaving this function. +	 * +	 * Again, normal synchronize_sched() is not good enough. +	 * We need to do a hard force of sched synchronization.  	 */  	if (ops->flags & FTRACE_OPS_FL_DYNAMIC) -		synchronize_sched(); +		schedule_on_each_cpu(ftrace_sync); +  	return 0;  } @@ -622,12 +641,18 @@ static int function_stat_show(struct seq_file *m, void *v)  	if (rec->counter <= 1)  		stddev = 0;  	else { -		stddev = rec->time_squared - rec->counter * avg * avg; +		/* +		 * Apply Welford's method: +		 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2) +		 */ +		stddev = rec->counter * rec->time_squared - +			 rec->time * rec->time; +  		/*  		 * Divide only 1000 for ns^2 -> us^2 conversion.  		 * trace_print_graph_duration will divide 1000 again.  		 */ -		do_div(stddev, (rec->counter - 1) * 1000); +		do_div(stddev, rec->counter * (rec->counter - 1) * 1000);  	}  	trace_seq_init(&s); @@ -1416,12 +1441,22 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,   * the hashes are freed with call_rcu_sched().   */  static int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)  {  	struct ftrace_hash *filter_hash;  	struct ftrace_hash *notrace_hash;  	int ret; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS +	/* +	 * There's a small race when adding ops that the ftrace handler +	 * that wants regs, may be called without them. We can not +	 * allow that handler to be called if regs is NULL. +	 */ +	if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS)) +		return 0; +#endif +  	filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);  	notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); @@ -2134,12 +2169,57 @@ static cycle_t		ftrace_update_time;  static unsigned long	ftrace_update_cnt;  unsigned long		ftrace_update_tot_cnt; -static int ops_traces_mod(struct ftrace_ops *ops) +static inline int ops_traces_mod(struct ftrace_ops *ops)  { -	struct ftrace_hash *hash; +	/* +	 * Filter_hash being empty will default to trace module. +	 * But notrace hash requires a test of individual module functions. +	 */ +	return ftrace_hash_empty(ops->filter_hash) && +		ftrace_hash_empty(ops->notrace_hash); +} + +/* + * Check if the current ops references the record. + * + * If the ops traces all functions, then it was already accounted for. + * If the ops does not trace the current record function, skip it. + * If the ops ignores the function via notrace filter, skip it. 
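The profiler change above switches function_stat_show() to compute the sample variance from the running sums the record already keeps (rec->time and rec->time_squared), using s^2 = (n * sum(x^2) - (sum x)^2) / (n * (n - 1)). A small standalone check of that identity against a plain two-pass computation:

#include <stdio.h>

/* variance from running sums, the same algebraic form used above */
static double var_from_sums(int n, double sum, double sum_sq)
{
	if (n <= 1)
		return 0.0;
	return (n * sum_sq - sum * sum) / ((double)n * (n - 1));
}

int main(void)
{
	double x[] = { 4, 7, 13, 16 };
	int n = sizeof(x) / sizeof(x[0]);
	double sum = 0, sum_sq = 0, mean, twopass = 0;

	for (int i = 0; i < n; i++) {
		sum += x[i];
		sum_sq += x[i] * x[i];
	}
	mean = sum / n;
	for (int i = 0; i < n; i++)
		twopass += (x[i] - mean) * (x[i] - mean);
	twopass /= n - 1;

	/* both print 30.000000 */
	printf("running sums: %f  two-pass: %f\n",
	       var_from_sums(n, sum, sum_sq), twopass);
	return 0;
}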
+ */ +static inline bool +ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) +{ +	/* If ops isn't enabled, ignore it */ +	if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) +		return 0; -	hash = ops->filter_hash; -	return ftrace_hash_empty(hash); +	/* If ops traces all mods, we already accounted for it */ +	if (ops_traces_mod(ops)) +		return 0; + +	/* The function must be in the filter */ +	if (!ftrace_hash_empty(ops->filter_hash) && +	    !ftrace_lookup_ip(ops->filter_hash, rec->ip)) +		return 0; + +	/* If in notrace hash, we ignore it too */ +	if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) +		return 0; + +	return 1; +} + +static int referenced_filters(struct dyn_ftrace *rec) +{ +	struct ftrace_ops *ops; +	int cnt = 0; + +	for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { +		if (ops_references_rec(ops, rec)) +		    cnt++; +	} + +	return cnt;  }  static int ftrace_update_code(struct module *mod) @@ -2148,6 +2228,7 @@ static int ftrace_update_code(struct module *mod)  	struct dyn_ftrace *p;  	cycle_t start, stop;  	unsigned long ref = 0; +	bool test = false;  	int i;  	/* @@ -2161,9 +2242,12 @@ static int ftrace_update_code(struct module *mod)  		for (ops = ftrace_ops_list;  		     ops != &ftrace_list_end; ops = ops->next) { -			if (ops->flags & FTRACE_OPS_FL_ENABLED && -			    ops_traces_mod(ops)) -				ref++; +			if (ops->flags & FTRACE_OPS_FL_ENABLED) { +				if (ops_traces_mod(ops)) +					ref++; +				else +					test = true; +			}  		}  	} @@ -2173,12 +2257,16 @@ static int ftrace_update_code(struct module *mod)  	for (pg = ftrace_new_pgs; pg; pg = pg->next) {  		for (i = 0; i < pg->index; i++) { +			int cnt = ref; +  			/* If something went wrong, bail without enabling anything */  			if (unlikely(ftrace_disabled))  				return -1;  			p = &pg->records[i]; -			p->flags = ref; +			if (test) +				cnt += referenced_filters(p); +			p->flags = cnt;  			/*  			 * Do the initial record conversion from mcount jump @@ -2198,7 +2286,7 @@ static int ftrace_update_code(struct module *mod)  			 * conversion puts the module to the correct state, thus  			 * passing the ftrace_make_call check.  			 
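ops_references_rec() and referenced_filters() above decide, per function record, how many enabled ftrace_ops actually want that function, instead of the old all-or-nothing ops_traces_mod() test. The toy below keeps only the filter/notrace part of that decision; struct ops_model, the string lists and the function names are all invented, where the kernel uses hashes of instruction addresses:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* toy ftrace_ops: an empty filter list means "trace everything",
 * the notrace list always wins */
struct ops_model {
	const char *filter[4];	/* NULL terminated */
	const char *notrace[4];
};

static bool list_has(const char *const *list, const char *fn)
{
	for (int i = 0; list[i]; i++)
		if (!strcmp(list[i], fn))
			return true;
	return false;
}

/* loosely mirrors ops_references_rec(): the function must pass the
 * filter (or the filter must be empty) and must not be in notrace */
static bool references(const struct ops_model *ops, const char *fn)
{
	if (ops->filter[0] && !list_has(ops->filter, fn))
		return false;
	return !list_has(ops->notrace, fn);
}

int main(void)
{
	struct ops_model a = { .filter  = { "vfs_read" } };
	struct ops_model b = { .notrace = { "vfs_write" } };
	const struct ops_model *all[] = { &a, &b };
	const char *fns[] = { "vfs_read", "vfs_write", "do_fork" };

	for (int i = 0; i < 3; i++) {
		int cnt = 0;

		for (int j = 0; j < 2; j++)
			cnt += references(all[j], fns[i]);
		printf("%-9s referenced by %d ops\n", fns[i], cnt);
	}
	return 0;
}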
*/ -			if (ftrace_start_up && ref) { +			if (ftrace_start_up && cnt) {  				int failed = __ftrace_replace_code(p, 1);  				if (failed)  					ftrace_bug(failed, p->ip); @@ -3349,6 +3437,12 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)  	return add_hash_entry(hash, ip);  } +static void ftrace_ops_update_code(struct ftrace_ops *ops) +{ +	if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) +		ftrace_run_update_code(FTRACE_UPDATE_CALLS); +} +  static int  ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,  		unsigned long ip, int remove, int reset, int enable) @@ -3391,9 +3485,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,  	mutex_lock(&ftrace_lock);  	ret = ftrace_hash_move(ops, enable, orig_hash, hash); -	if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED -	    && ftrace_enabled) -		ftrace_run_update_code(FTRACE_UPDATE_CALLS); +	if (!ret) +		ftrace_ops_update_code(ops);  	mutex_unlock(&ftrace_lock); @@ -3512,8 +3605,12 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);  static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;  static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; +/* Used by function selftest to not test if filter is set */ +bool ftrace_filter_param __initdata; +  static int __init set_ftrace_notrace(char *str)  { +	ftrace_filter_param = true;  	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);  	return 1;  } @@ -3521,6 +3618,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace);  static int __init set_ftrace_filter(char *str)  { +	ftrace_filter_param = true;  	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);  	return 1;  } @@ -3615,9 +3713,8 @@ int ftrace_regex_release(struct inode *inode, struct file *file)  		mutex_lock(&ftrace_lock);  		ret = ftrace_hash_move(iter->ops, filter_hash,  				       orig_hash, iter->hash); -		if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) -		    && ftrace_enabled) -			ftrace_run_update_code(FTRACE_UPDATE_CALLS); +		if (!ret) +			ftrace_ops_update_code(iter->ops);  		mutex_unlock(&ftrace_lock);  	} @@ -4188,7 +4285,7 @@ static inline void ftrace_startup_enable(int command) { }  # define ftrace_shutdown_sysctl()	do { } while (0)  static inline int -ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)  {  	return 1;  } @@ -4211,7 +4308,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,  	do_for_each_ftrace_op(op, ftrace_control_list) {  		if (!(op->flags & FTRACE_OPS_FL_STUB) &&  		    !ftrace_function_local_disabled(op) && -		    ftrace_ops_test(op, ip)) +		    ftrace_ops_test(op, ip, regs))  			op->func(ip, parent_ip, op, regs);  	} while_for_each_ftrace_op(op);  	trace_recursion_clear(TRACE_CONTROL_BIT); @@ -4244,7 +4341,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,  	 */  	preempt_disable_notrace();  	do_for_each_ftrace_op(op, ftrace_ops_list) { -		if (ftrace_ops_test(op, ip)) +		if (ftrace_ops_test(op, ip, regs))  			op->func(ip, parent_ip, op, regs);  	} while_for_each_ftrace_op(op);  	preempt_enable_notrace(); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e444ff88f0a4..cc2f66f68dc5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -36,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s)  {  	int ret; -	ret = trace_seq_printf(s, "# compressed entry header\n"); -	ret = trace_seq_printf(s, "\ttype_len    :    5 bits\n"); -	ret = trace_seq_printf(s, 
"\ttime_delta  :   27 bits\n"); -	ret = trace_seq_printf(s, "\tarray       :   32 bits\n"); -	ret = trace_seq_printf(s, "\n"); +	ret = trace_seq_puts(s, "# compressed entry header\n"); +	ret = trace_seq_puts(s, "\ttype_len    :    5 bits\n"); +	ret = trace_seq_puts(s, "\ttime_delta  :   27 bits\n"); +	ret = trace_seq_puts(s, "\tarray       :   32 bits\n"); +	ret = trace_seq_putc(s, '\n');  	ret = trace_seq_printf(s, "\tpadding     : type == %d\n",  			       RINGBUF_TYPE_PADDING);  	ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", @@ -1066,7 +1066,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,  }  /** - * check_pages - integrity check of buffer pages + * rb_check_pages - integrity check of buffer pages   * @cpu_buffer: CPU buffer with pages to test   *   * As a safety measure we check to make sure the data pages have not @@ -1258,7 +1258,7 @@ static int rb_cpu_notify(struct notifier_block *self,  #endif  /** - * ring_buffer_alloc - allocate a new ring_buffer + * __ring_buffer_alloc - allocate a new ring_buffer   * @size: the size in bytes per cpu that is needed.   * @flags: attributes to set for the ring buffer.   * @@ -1607,6 +1607,7 @@ static void update_pages_handler(struct work_struct *work)   * ring_buffer_resize - resize the ring buffer   * @buffer: the buffer to resize.   * @size: the new size. + * @cpu_id: the cpu buffer to resize   *   * Minimum size is 2 * BUF_PAGE_SIZE.   * @@ -3956,11 +3957,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);   * expected.   *   * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_prepare_sync. + * expected to make at least one call to ring_buffer_read_prepare_sync.   * Afterwards, ring_buffer_read_start is invoked to get things going   * for real.   * - * This overall must be paired with ring_buffer_finish. + * This overall must be paired with ring_buffer_read_finish.   */  struct ring_buffer_iter *  ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) @@ -4009,7 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);   * an intervening ring_buffer_read_prepare_sync must have been   * performed.   * - * Must be paired with ring_buffer_finish. + * Must be paired with ring_buffer_read_finish.   */  void  ring_buffer_read_start(struct ring_buffer_iter *iter) @@ -4031,7 +4032,7 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)  EXPORT_SYMBOL_GPL(ring_buffer_read_start);  /** - * ring_buffer_finish - finish reading the iterator of the buffer + * ring_buffer_read_finish - finish reading the iterator of the buffer   * @iter: The iterator retrieved by ring_buffer_start   *   * This re-enables the recording to the buffer, and frees the @@ -4346,6 +4347,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);  /**   * ring_buffer_alloc_read_page - allocate a page to read from buffer   * @buffer: the buffer to allocate for. + * @cpu: the cpu buffer to allocate.   *   * This function is used in conjunction with ring_buffer_read_page.   * When reading a full page from the ring buffer, these functions @@ -4403,7 +4405,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);   * to swap with a page in the ring buffer.   
*   * for example: - *	rpage = ring_buffer_alloc_read_page(buffer); + *	rpage = ring_buffer_alloc_read_page(buffer, cpu);   *	if (!rpage)   *		return error;   *	ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e71a8be4a6ee..496f94d57698 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -115,6 +115,9 @@ cpumask_var_t __read_mostly	tracing_buffer_mask;  enum ftrace_dump_mode ftrace_dump_on_oops; +/* When set, tracing will stop when a WARN*() is hit */ +int __disable_trace_on_warning; +  static int tracing_set_tracer(const char *buf);  #define MAX_TRACER_SIZE		100 @@ -149,6 +152,13 @@ static int __init set_ftrace_dump_on_oops(char *str)  }  __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init stop_trace_on_warning(char *str) +{ +	__disable_trace_on_warning = 1; +	return 1; +} +__setup("traceoff_on_warning=", stop_trace_on_warning); +  static int __init boot_alloc_snapshot(char *str)  {  	allocate_snapshot = true; @@ -170,6 +180,7 @@ static int __init set_trace_boot_options(char *str)  }  __setup("trace_options=", set_trace_boot_options); +  unsigned long long ns2usecs(cycle_t nsec)  {  	nsec += 500; @@ -193,6 +204,37 @@ static struct trace_array	global_trace;  LIST_HEAD(ftrace_trace_arrays); +int trace_array_get(struct trace_array *this_tr) +{ +	struct trace_array *tr; +	int ret = -ENODEV; + +	mutex_lock(&trace_types_lock); +	list_for_each_entry(tr, &ftrace_trace_arrays, list) { +		if (tr == this_tr) { +			tr->ref++; +			ret = 0; +			break; +		} +	} +	mutex_unlock(&trace_types_lock); + +	return ret; +} + +static void __trace_array_put(struct trace_array *this_tr) +{ +	WARN_ON(!this_tr->ref); +	this_tr->ref--; +} + +void trace_array_put(struct trace_array *this_tr) +{ +	mutex_lock(&trace_types_lock); +	__trace_array_put(this_tr); +	mutex_unlock(&trace_types_lock); +} +  int filter_current_check_discard(struct ring_buffer *buffer,  				 struct ftrace_event_call *call, void *rec,  				 struct ring_buffer_event *event) @@ -201,23 +243,43 @@ int filter_current_check_discard(struct ring_buffer *buffer,  }  EXPORT_SYMBOL_GPL(filter_current_check_discard); -cycle_t ftrace_now(int cpu) +cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)  {  	u64 ts;  	/* Early boot up does not have a buffer yet */ -	if (!global_trace.trace_buffer.buffer) +	if (!buf->buffer)  		return trace_clock_local(); -	ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); -	ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); +	ts = ring_buffer_time_stamp(buf->buffer, cpu); +	ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);  	return ts;  } +cycle_t ftrace_now(int cpu) +{ +	return buffer_ftrace_now(&global_trace.trace_buffer, cpu); +} + +/** + * tracing_is_enabled - Show if global_trace has been disabled + * + * Shows if the global trace has been enabled or not. It uses the + * mirror flag "buffer_disabled" to be used in fast paths such as for + * the irqsoff tracer. But it may be inaccurate due to races. If you + * need to know the accurate state, use tracing_is_on() which is a little + * slower, but accurate. + */  int tracing_is_enabled(void)  { -	return tracing_is_on(); +	/* +	 * For quick access (irqsoff uses this in fast path), just +	 * return the mirror variable of the state of the ring buffer. +	 * It's a little racy, but we don't really care. 
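trace_array_get() above takes a reference only after confirming, under trace_types_lock, that the trace_array is still on ftrace_trace_arrays, so a concurrent instance removal cannot free it between lookup and use. A userspace sketch of that lookup-then-ref idiom with a pthread mutex (tr_model, tr_get() and tr_put() are toy names; build with -pthread):

#include <pthread.h>
#include <stdio.h>

struct tr_model {
	const char *name;
	int ref;
	struct tr_model *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct tr_model instance_b = { "instance_b", 0, NULL };
static struct tr_model global_tr  = { "global", 0, &instance_b };
static struct tr_model *tr_list   = &global_tr;

/* take a reference only while the object is proven to be on the list */
static int tr_get(struct tr_model *this_tr)
{
	struct tr_model *tr;
	int ret = -1;

	pthread_mutex_lock(&list_lock);
	for (tr = tr_list; tr; tr = tr->next) {
		if (tr == this_tr) {
			tr->ref++;
			ret = 0;
			break;
		}
	}
	pthread_mutex_unlock(&list_lock);
	return ret;
}

static void tr_put(struct tr_model *tr)
{
	pthread_mutex_lock(&list_lock);
	tr->ref--;
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	if (!tr_get(&instance_b)) {
		printf("%s ref=%d\n", instance_b.name, instance_b.ref);
		tr_put(&instance_b);
	}
	printf("%s ref=%d\n", instance_b.name, instance_b.ref);
	return 0;
}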
+	 */ +	smp_rmb(); +	return !global_trace.buffer_disabled;  }  /* @@ -240,7 +302,7 @@ static struct tracer		*trace_types __read_mostly;  /*   * trace_types_lock is used to protect the trace_types list.   */ -static DEFINE_MUTEX(trace_types_lock); +DEFINE_MUTEX(trace_types_lock);  /*   * serialize the access of the ring buffer @@ -330,6 +392,23 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |  	TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |  	TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; +static void tracer_tracing_on(struct trace_array *tr) +{ +	if (tr->trace_buffer.buffer) +		ring_buffer_record_on(tr->trace_buffer.buffer); +	/* +	 * This flag is looked at when buffers haven't been allocated +	 * yet, or by some tracers (like irqsoff), that just want to +	 * know if the ring buffer has been disabled, but it can handle +	 * races of where it gets disabled but we still do a record. +	 * As the check is in the fast path of the tracers, it is more +	 * important to be fast than accurate. +	 */ +	tr->buffer_disabled = 0; +	/* Make the flag seen by readers */ +	smp_wmb(); +} +  /**   * tracing_on - enable tracing buffers   * @@ -338,15 +417,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |   */  void tracing_on(void)  { -	if (global_trace.trace_buffer.buffer) -		ring_buffer_record_on(global_trace.trace_buffer.buffer); -	/* -	 * This flag is only looked at when buffers haven't been -	 * allocated yet. We don't really care about the race -	 * between setting this flag and actually turning -	 * on the buffer. -	 */ -	global_trace.buffer_disabled = 0; +	tracer_tracing_on(&global_trace);  }  EXPORT_SYMBOL_GPL(tracing_on); @@ -540,6 +611,23 @@ void tracing_snapshot_alloc(void)  EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);  #endif /* CONFIG_TRACER_SNAPSHOT */ +static void tracer_tracing_off(struct trace_array *tr) +{ +	if (tr->trace_buffer.buffer) +		ring_buffer_record_off(tr->trace_buffer.buffer); +	/* +	 * This flag is looked at when buffers haven't been allocated +	 * yet, or by some tracers (like irqsoff), that just want to +	 * know if the ring buffer has been disabled, but it can handle +	 * races of where it gets disabled but we still do a record. +	 * As the check is in the fast path of the tracers, it is more +	 * important to be fast than accurate. +	 */ +	tr->buffer_disabled = 1; +	/* Make the flag seen by readers */ +	smp_wmb(); +} +  /**   * tracing_off - turn off tracing buffers   * @@ -550,26 +638,35 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);   */  void tracing_off(void)  { -	if (global_trace.trace_buffer.buffer) -		ring_buffer_record_off(global_trace.trace_buffer.buffer); -	/* -	 * This flag is only looked at when buffers haven't been -	 * allocated yet. We don't really care about the race -	 * between setting this flag and actually turning -	 * on the buffer. -	 */ -	global_trace.buffer_disabled = 1; +	tracer_tracing_off(&global_trace);  }  EXPORT_SYMBOL_GPL(tracing_off); +void disable_trace_on_warning(void) +{ +	if (__disable_trace_on_warning) +		tracing_off(); +} + +/** + * tracer_tracing_is_on - show real state of ring buffer enabled + * @tr : the trace array to know if ring buffer is enabled + * + * Shows real state of the ring buffer if it is enabled or not. 
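tracer_tracing_on()/tracer_tracing_off() above keep a buffer_disabled mirror of the ring buffer state so that hot paths can check it with a plain load plus a barrier, accepting a small window of staleness. Roughly the same idea in portable C11 atomics (the demo_ names are invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* mirror of the ring buffer's on/off state, readable from fast paths
 * without touching the buffer itself (may briefly lag the real state) */
static atomic_bool buffer_disabled;

static void demo_tracing_off(void)
{
	/* ...the real code calls ring_buffer_record_off() here... */
	atomic_store_explicit(&buffer_disabled, true, memory_order_release);
}

static bool demo_tracing_is_enabled(void)
{
	/* fast, slightly racy check used on hot paths */
	return !atomic_load_explicit(&buffer_disabled, memory_order_acquire);
}

int main(void)
{
	printf("enabled: %d\n", demo_tracing_is_enabled());
	demo_tracing_off();
	printf("enabled: %d\n", demo_tracing_is_enabled());
	return 0;
}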
+ */ +static int tracer_tracing_is_on(struct trace_array *tr) +{ +	if (tr->trace_buffer.buffer) +		return ring_buffer_record_is_on(tr->trace_buffer.buffer); +	return !tr->buffer_disabled; +} +  /**   * tracing_is_on - show state of ring buffers enabled   */  int tracing_is_on(void)  { -	if (global_trace.trace_buffer.buffer) -		return ring_buffer_record_is_on(global_trace.trace_buffer.buffer); -	return !global_trace.buffer_disabled; +	return tracer_tracing_is_on(&global_trace);  }  EXPORT_SYMBOL_GPL(tracing_is_on); @@ -1119,7 +1216,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)  	/* Make sure all commits have finished */  	synchronize_sched(); -	buf->time_start = ftrace_now(buf->cpu); +	buf->time_start = buffer_ftrace_now(buf, buf->cpu);  	for_each_online_cpu(cpu)  		ring_buffer_reset_cpu(buffer, cpu); @@ -1127,23 +1224,17 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)  	ring_buffer_record_enable(buffer);  } -void tracing_reset_current(int cpu) -{ -	tracing_reset(&global_trace.trace_buffer, cpu); -} - +/* Must have trace_types_lock held */  void tracing_reset_all_online_cpus(void)  {  	struct trace_array *tr; -	mutex_lock(&trace_types_lock);  	list_for_each_entry(tr, &ftrace_trace_arrays, list) {  		tracing_reset_online_cpus(&tr->trace_buffer);  #ifdef CONFIG_TRACER_MAX_TRACE  		tracing_reset_online_cpus(&tr->max_buffer);  #endif  	} -	mutex_unlock(&trace_types_lock);  }  #define SAVED_CMDLINES 128 @@ -1543,15 +1634,6 @@ trace_function(struct trace_array *tr,  		__buffer_unlock_commit(buffer, event);  } -void -ftrace(struct trace_array *tr, struct trace_array_cpu *data, -       unsigned long ip, unsigned long parent_ip, unsigned long flags, -       int pc) -{ -	if (likely(!atomic_read(&data->disabled))) -		trace_function(tr, ip, parent_ip, flags, pc); -} -  #ifdef CONFIG_STACKTRACE  #define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) @@ -2760,6 +2842,17 @@ static int s_show(struct seq_file *m, void *v)  	return 0;  } +/* + * Should be used after trace_array_get(), trace_types_lock + * ensures that i_cdev was already initialized. + */ +static inline int tracing_get_cpu(struct inode *inode) +{ +	if (inode->i_cdev) /* See trace_create_cpu_file() */ +		return (long)inode->i_cdev - 1; +	return RING_BUFFER_ALL_CPUS; +} +  static const struct seq_operations tracer_seq_ops = {  	.start		= s_start,  	.next		= s_next, @@ -2770,8 +2863,7 @@ static const struct seq_operations tracer_seq_ops = {  static struct trace_iterator *  __tracing_open(struct inode *inode, struct file *file, bool snapshot)  { -	struct trace_cpu *tc = inode->i_private; -	struct trace_array *tr = tc->tr; +	struct trace_array *tr = inode->i_private;  	struct trace_iterator *iter;  	int cpu; @@ -2812,8 +2904,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)  		iter->trace_buffer = &tr->trace_buffer;  	iter->snapshot = snapshot;  	iter->pos = -1; +	iter->cpu_file = tracing_get_cpu(inode);  	mutex_init(&iter->mutex); -	iter->cpu_file = tc->cpu;  	/* Notify the tracer early; before we stop tracing. */  	if (iter->trace && iter->trace->open) @@ -2850,8 +2942,6 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)  		tracing_iter_reset(iter, cpu);  	} -	tr->ref++; -  	mutex_unlock(&trace_types_lock);  	return iter; @@ -2874,24 +2964,41 @@ int tracing_open_generic(struct inode *inode, struct file *filp)  	return 0;  } +/* + * Open and update trace_array ref count. + * Must have the current trace_array passed to it. 
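tracing_get_cpu() above recovers the CPU number from inode->i_cdev, where trace_create_cpu_file() later in the patch stores cpu + 1 so that a NULL i_cdev keeps meaning "all CPUs". The trick in isolation (encode_cpu()/decode_cpu() are illustrative names):

#include <stdio.h>

#define ALL_CPUS	(-1L)

/* stash a small integer in a pointer field, offset by one so that
 * NULL still means "not set", i.e. "all CPUs" */
static void *encode_cpu(long cpu)
{
	return (void *)(cpu + 1);
}

static long decode_cpu(void *cookie)
{
	if (cookie)
		return (long)cookie - 1;
	return ALL_CPUS;
}

int main(void)
{
	void *c0 = encode_cpu(0), *c3 = encode_cpu(3), *none = NULL;

	/* prints: 0 3 -1 */
	printf("%ld %ld %ld\n", decode_cpu(c0), decode_cpu(c3),
	       decode_cpu(none));
	return 0;
}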
+ */ +static int tracing_open_generic_tr(struct inode *inode, struct file *filp) +{ +	struct trace_array *tr = inode->i_private; + +	if (tracing_disabled) +		return -ENODEV; + +	if (trace_array_get(tr) < 0) +		return -ENODEV; + +	filp->private_data = inode->i_private; + +	return 0; +} +  static int tracing_release(struct inode *inode, struct file *file)  { +	struct trace_array *tr = inode->i_private;  	struct seq_file *m = file->private_data;  	struct trace_iterator *iter; -	struct trace_array *tr;  	int cpu; -	if (!(file->f_mode & FMODE_READ)) +	if (!(file->f_mode & FMODE_READ)) { +		trace_array_put(tr);  		return 0; +	} +	/* Writes do not use seq_file */  	iter = m->private; -	tr = iter->tr; -  	mutex_lock(&trace_types_lock); -	WARN_ON(!tr->ref); -	tr->ref--; -  	for_each_tracing_cpu(cpu) {  		if (iter->buffer_iter[cpu])  			ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -2903,6 +3010,9 @@ static int tracing_release(struct inode *inode, struct file *file)  	if (!iter->snapshot)  		/* reenable tracing if it was previously enabled */  		tracing_start_tr(tr); + +	__trace_array_put(tr); +  	mutex_unlock(&trace_types_lock);  	mutex_destroy(&iter->mutex); @@ -2910,24 +3020,44 @@ static int tracing_release(struct inode *inode, struct file *file)  	kfree(iter->trace);  	kfree(iter->buffer_iter);  	seq_release_private(inode, file); + +	return 0; +} + +static int tracing_release_generic_tr(struct inode *inode, struct file *file) +{ +	struct trace_array *tr = inode->i_private; + +	trace_array_put(tr);  	return 0;  } +static int tracing_single_release_tr(struct inode *inode, struct file *file) +{ +	struct trace_array *tr = inode->i_private; + +	trace_array_put(tr); + +	return single_release(inode, file); +} +  static int tracing_open(struct inode *inode, struct file *file)  { +	struct trace_array *tr = inode->i_private;  	struct trace_iterator *iter;  	int ret = 0; +	if (trace_array_get(tr) < 0) +		return -ENODEV; +  	/* If this file was open for write, then erase contents */ -	if ((file->f_mode & FMODE_WRITE) && -	    (file->f_flags & O_TRUNC)) { -		struct trace_cpu *tc = inode->i_private; -		struct trace_array *tr = tc->tr; +	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { +		int cpu = tracing_get_cpu(inode); -		if (tc->cpu == RING_BUFFER_ALL_CPUS) +		if (cpu == RING_BUFFER_ALL_CPUS)  			tracing_reset_online_cpus(&tr->trace_buffer);  		else -			tracing_reset(&tr->trace_buffer, tc->cpu); +			tracing_reset(&tr->trace_buffer, cpu);  	}  	if (file->f_mode & FMODE_READ) { @@ -2937,6 +3067,10 @@ static int tracing_open(struct inode *inode, struct file *file)  		else if (trace_flags & TRACE_ITER_LATENCY_FMT)  			iter->iter_flags |= TRACE_FILE_LAT_FMT;  	} + +	if (ret < 0) +		trace_array_put(tr); +  	return ret;  } @@ -3293,17 +3427,27 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,  static int tracing_trace_options_open(struct inode *inode, struct file *file)  { +	struct trace_array *tr = inode->i_private; +	int ret; +  	if (tracing_disabled)  		return -ENODEV; -	return single_open(file, tracing_trace_options_show, inode->i_private); +	if (trace_array_get(tr) < 0) +		return -ENODEV; + +	ret = single_open(file, tracing_trace_options_show, inode->i_private); +	if (ret < 0) +		trace_array_put(tr); + +	return ret;  }  static const struct file_operations tracing_iter_fops = {  	.open		= tracing_trace_options_open,  	.read		= seq_read,  	.llseek		= seq_lseek, -	.release	= single_release, +	.release	= tracing_single_release_tr,  	.write		= 
tracing_trace_options_write,  }; @@ -3379,14 +3523,14 @@ static const char readme_msg[] =  	"\n  snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"  	"\t\t\t  Read the contents for more information\n"  #endif -#ifdef CONFIG_STACKTRACE +#ifdef CONFIG_STACK_TRACER  	"  stack_trace\t\t- Shows the max stack trace when active\n"  	"  stack_max_size\t- Shows current max stack size that was traced\n"  	"\t\t\t  Write into this file to reset the max size (trigger a new trace)\n"  #ifdef CONFIG_DYNAMIC_FTRACE  	"  stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"  #endif -#endif /* CONFIG_STACKTRACE */ +#endif /* CONFIG_STACK_TRACER */  ;  static ssize_t @@ -3783,20 +3927,23 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,  static int tracing_open_pipe(struct inode *inode, struct file *filp)  { -	struct trace_cpu *tc = inode->i_private; -	struct trace_array *tr = tc->tr; +	struct trace_array *tr = inode->i_private;  	struct trace_iterator *iter;  	int ret = 0;  	if (tracing_disabled)  		return -ENODEV; +	if (trace_array_get(tr) < 0) +		return -ENODEV; +  	mutex_lock(&trace_types_lock);  	/* create a buffer to store the information to pass to userspace */  	iter = kzalloc(sizeof(*iter), GFP_KERNEL);  	if (!iter) {  		ret = -ENOMEM; +		__trace_array_put(tr);  		goto out;  	} @@ -3826,9 +3973,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)  	if (trace_clocks[tr->clock_id].in_ns)  		iter->iter_flags |= TRACE_FILE_TIME_IN_NS; -	iter->cpu_file = tc->cpu; -	iter->tr = tc->tr; -	iter->trace_buffer = &tc->tr->trace_buffer; +	iter->tr = tr; +	iter->trace_buffer = &tr->trace_buffer; +	iter->cpu_file = tracing_get_cpu(inode);  	mutex_init(&iter->mutex);  	filp->private_data = iter; @@ -3843,6 +3990,7 @@ out:  fail:  	kfree(iter->trace);  	kfree(iter); +	__trace_array_put(tr);  	mutex_unlock(&trace_types_lock);  	return ret;  } @@ -3850,6 +3998,7 @@ fail:  static int tracing_release_pipe(struct inode *inode, struct file *file)  {  	struct trace_iterator *iter = file->private_data; +	struct trace_array *tr = inode->i_private;  	mutex_lock(&trace_types_lock); @@ -3863,6 +4012,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)  	kfree(iter->trace);  	kfree(iter); +	trace_array_put(tr); +  	return 0;  } @@ -3939,7 +4090,7 @@ static int tracing_wait_pipe(struct file *filp)  		 *  		 * iter->pos will be 0 if we haven't read anything.  		 
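tracing_open_pipe() above now takes a trace_array reference up front, so every failure path has to drop it again before returning, and tracing_release_pipe() drops it on the normal close path. A compact model of that acquire/allocate/unwind shape (get_ref(), put_ref() and open_pipe_demo() are made-up stand-ins):

#include <stdio.h>
#include <stdlib.h>

static int refs;

static int get_ref(void)  { refs++; return 0; }
static void put_ref(void) { refs--; }

/* open-style function: any failure after get_ref() must undo it */
static void *open_pipe_demo(size_t sz)
{
	void *iter;

	if (get_ref())
		return NULL;

	iter = calloc(1, sz);
	if (!iter) {
		put_ref();	/* drop the reference taken above */
		return NULL;
	}
	return iter;
}

int main(void)
{
	void *it = open_pipe_demo(64);

	if (it) {
		printf("opened, refs=%d\n", refs);
		free(it);
		put_ref();	/* paired release on the close path */
	}
	printf("after release, refs=%d\n", refs);
	return 0;
}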
*/ -		if (!tracing_is_enabled() && iter->pos) +		if (!tracing_is_on() && iter->pos)  			break;  	} @@ -4000,6 +4151,7 @@ waitagain:  	memset(&iter->seq, 0,  	       sizeof(struct trace_iterator) -  	       offsetof(struct trace_iterator, seq)); +	cpumask_clear(iter->started);  	iter->pos = -1;  	trace_event_read_lock(); @@ -4200,15 +4352,16 @@ static ssize_t  tracing_entries_read(struct file *filp, char __user *ubuf,  		     size_t cnt, loff_t *ppos)  { -	struct trace_cpu *tc = filp->private_data; -	struct trace_array *tr = tc->tr; +	struct inode *inode = file_inode(filp); +	struct trace_array *tr = inode->i_private; +	int cpu = tracing_get_cpu(inode);  	char buf[64];  	int r = 0;  	ssize_t ret;  	mutex_lock(&trace_types_lock); -	if (tc->cpu == RING_BUFFER_ALL_CPUS) { +	if (cpu == RING_BUFFER_ALL_CPUS) {  		int cpu, buf_size_same;  		unsigned long size; @@ -4235,7 +4388,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,  		} else  			r = sprintf(buf, "X\n");  	} else -		r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); +		r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10);  	mutex_unlock(&trace_types_lock); @@ -4247,7 +4400,8 @@ static ssize_t  tracing_entries_write(struct file *filp, const char __user *ubuf,  		      size_t cnt, loff_t *ppos)  { -	struct trace_cpu *tc = filp->private_data; +	struct inode *inode = file_inode(filp); +	struct trace_array *tr = inode->i_private;  	unsigned long val;  	int ret; @@ -4261,8 +4415,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,  	/* value is in KB */  	val <<= 10; - -	ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu); +	ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode));  	if (ret < 0)  		return ret; @@ -4316,10 +4469,12 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)  	/* disable tracing ? */  	if (trace_flags & TRACE_ITER_STOP_ON_FREE) -		tracing_off(); +		tracer_tracing_off(tr);  	/* resize the ring buffer to 0 */  	tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); +	trace_array_put(tr); +  	return 0;  } @@ -4328,6 +4483,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  					size_t cnt, loff_t *fpos)  {  	unsigned long addr = (unsigned long)ubuf; +	struct trace_array *tr = filp->private_data;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer;  	struct print_entry *entry; @@ -4387,7 +4543,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,  	local_save_flags(irq_flags);  	size = sizeof(*entry) + cnt + 2; /* possible \n added */ -	buffer = global_trace.trace_buffer.buffer; +	buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,  					  irq_flags, preempt_count());  	if (!event) { @@ -4478,12 +4634,12 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,  	 * New clock may not be consistent with the previous clock.  	 * Reset the buffer so that it doesn't have incomparable timestamps.  	 
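With the per-CPU files now resolved through tracing_get_cpu(), the all-CPUs buffer_size_kb read above still prints a single number only when every per-CPU buffer has the same size, and "X" otherwise. The same decision in a few lines (the sizes are made up):

#include <stdio.h>

int main(void)
{
	unsigned long bytes[] = { 1441792, 1441792, 1441792, 1441792 };
	int ncpus = sizeof(bytes) / sizeof(bytes[0]);
	int same = 1;

	for (int cpu = 1; cpu < ncpus; cpu++)
		if (bytes[cpu] != bytes[0])
			same = 0;

	if (same)
		printf("%lu\n", bytes[0] >> 10);	/* bytes -> KB */
	else
		printf("X\n");
	return 0;
}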
*/ -	tracing_reset_online_cpus(&global_trace.trace_buffer); +	tracing_reset_online_cpus(&tr->trace_buffer);  #ifdef CONFIG_TRACER_MAX_TRACE  	if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)  		ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); -	tracing_reset_online_cpus(&global_trace.max_buffer); +	tracing_reset_online_cpus(&tr->max_buffer);  #endif  	mutex_unlock(&trace_types_lock); @@ -4495,10 +4651,20 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,  static int tracing_clock_open(struct inode *inode, struct file *file)  { +	struct trace_array *tr = inode->i_private; +	int ret; +  	if (tracing_disabled)  		return -ENODEV; -	return single_open(file, tracing_clock_show, inode->i_private); +	if (trace_array_get(tr)) +		return -ENODEV; + +	ret = single_open(file, tracing_clock_show, inode->i_private); +	if (ret < 0) +		trace_array_put(tr); + +	return ret;  }  struct ftrace_buffer_info { @@ -4510,31 +4676,40 @@ struct ftrace_buffer_info {  #ifdef CONFIG_TRACER_SNAPSHOT  static int tracing_snapshot_open(struct inode *inode, struct file *file)  { -	struct trace_cpu *tc = inode->i_private; +	struct trace_array *tr = inode->i_private;  	struct trace_iterator *iter;  	struct seq_file *m;  	int ret = 0; +	if (trace_array_get(tr) < 0) +		return -ENODEV; +  	if (file->f_mode & FMODE_READ) {  		iter = __tracing_open(inode, file, true);  		if (IS_ERR(iter))  			ret = PTR_ERR(iter);  	} else {  		/* Writes still need the seq_file to hold the private data */ +		ret = -ENOMEM;  		m = kzalloc(sizeof(*m), GFP_KERNEL);  		if (!m) -			return -ENOMEM; +			goto out;  		iter = kzalloc(sizeof(*iter), GFP_KERNEL);  		if (!iter) {  			kfree(m); -			return -ENOMEM; +			goto out;  		} -		iter->tr = tc->tr; -		iter->trace_buffer = &tc->tr->max_buffer; -		iter->cpu_file = tc->cpu; +		ret = 0; + +		iter->tr = tr; +		iter->trace_buffer = &tr->max_buffer; +		iter->cpu_file = tracing_get_cpu(inode);  		m->private = iter;  		file->private_data = m;  	} +out: +	if (ret < 0) +		trace_array_put(tr);  	return ret;  } @@ -4616,9 +4791,12 @@ out:  static int tracing_snapshot_release(struct inode *inode, struct file *file)  {  	struct seq_file *m = file->private_data; +	int ret; + +	ret = tracing_release(inode, file);  	if (file->f_mode & FMODE_READ) -		return tracing_release(inode, file); +		return ret;  	/* If write only, the seq_file is just a stub */  	if (m) @@ -4684,34 +4862,38 @@ static const struct file_operations tracing_pipe_fops = {  };  static const struct file_operations tracing_entries_fops = { -	.open		= tracing_open_generic, +	.open		= tracing_open_generic_tr,  	.read		= tracing_entries_read,  	.write		= tracing_entries_write,  	.llseek		= generic_file_llseek, +	.release	= tracing_release_generic_tr,  };  static const struct file_operations tracing_total_entries_fops = { -	.open		= tracing_open_generic, +	.open		= tracing_open_generic_tr,  	.read		= tracing_total_entries_read,  	.llseek		= generic_file_llseek, +	.release	= tracing_release_generic_tr,  };  static const struct file_operations tracing_free_buffer_fops = { +	.open		= tracing_open_generic_tr,  	.write		= tracing_free_buffer_write,  	.release	= tracing_free_buffer_release,  };  static const struct file_operations tracing_mark_fops = { -	.open		= tracing_open_generic, +	.open		= tracing_open_generic_tr,  	.write		= tracing_mark_write,  	.llseek		= generic_file_llseek, +	.release	= tracing_release_generic_tr,  };  static const struct file_operations trace_clock_fops = {  	.open		= 
tracing_clock_open,  	.read		= seq_read,  	.llseek		= seq_lseek, -	.release	= single_release, +	.release	= tracing_single_release_tr,  	.write		= tracing_clock_write,  }; @@ -4736,23 +4918,26 @@ static const struct file_operations snapshot_raw_fops = {  static int tracing_buffers_open(struct inode *inode, struct file *filp)  { -	struct trace_cpu *tc = inode->i_private; -	struct trace_array *tr = tc->tr; +	struct trace_array *tr = inode->i_private;  	struct ftrace_buffer_info *info; +	int ret;  	if (tracing_disabled)  		return -ENODEV; +	if (trace_array_get(tr) < 0) +		return -ENODEV; +  	info = kzalloc(sizeof(*info), GFP_KERNEL); -	if (!info) +	if (!info) { +		trace_array_put(tr);  		return -ENOMEM; +	}  	mutex_lock(&trace_types_lock); -	tr->ref++; -  	info->iter.tr		= tr; -	info->iter.cpu_file	= tc->cpu; +	info->iter.cpu_file	= tracing_get_cpu(inode);  	info->iter.trace	= tr->current_trace;  	info->iter.trace_buffer = &tr->trace_buffer;  	info->spare		= NULL; @@ -4763,7 +4948,11 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)  	mutex_unlock(&trace_types_lock); -	return nonseekable_open(inode, filp); +	ret = nonseekable_open(inode, filp); +	if (ret < 0) +		trace_array_put(tr); + +	return ret;  }  static unsigned int @@ -4863,8 +5052,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)  	mutex_lock(&trace_types_lock); -	WARN_ON(!iter->tr->ref); -	iter->tr->ref--; +	__trace_array_put(iter->tr);  	if (info->spare)  		ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); @@ -5066,14 +5254,14 @@ static ssize_t  tracing_stats_read(struct file *filp, char __user *ubuf,  		   size_t count, loff_t *ppos)  { -	struct trace_cpu *tc = filp->private_data; -	struct trace_array *tr = tc->tr; +	struct inode *inode = file_inode(filp); +	struct trace_array *tr = inode->i_private;  	struct trace_buffer *trace_buf = &tr->trace_buffer; +	int cpu = tracing_get_cpu(inode);  	struct trace_seq *s;  	unsigned long cnt;  	unsigned long long t;  	unsigned long usec_rem; -	int cpu = tc->cpu;  	s = kmalloc(sizeof(*s), GFP_KERNEL);  	if (!s) @@ -5126,9 +5314,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf,  }  static const struct file_operations tracing_stats_fops = { -	.open		= tracing_open_generic, +	.open		= tracing_open_generic_tr,  	.read		= tracing_stats_read,  	.llseek		= generic_file_llseek, +	.release	= tracing_release_generic_tr,  };  #ifdef CONFIG_DYNAMIC_FTRACE @@ -5317,10 +5506,20 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)  	return tr->percpu_dir;  } +static struct dentry * +trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, +		      void *data, long cpu, const struct file_operations *fops) +{ +	struct dentry *ret = trace_create_file(name, mode, parent, data, fops); + +	if (ret) /* See tracing_get_cpu() */ +		ret->d_inode->i_cdev = (void *)(cpu + 1); +	return ret; +} +  static void  tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)  { -	struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);  	struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);  	struct dentry *d_cpu;  	char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -5336,28 +5535,28 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)  	}  	/* per cpu trace_pipe */ -	trace_create_file("trace_pipe", 0444, d_cpu, -			(void *)&data->trace_cpu, &tracing_pipe_fops); +	trace_create_cpu_file("trace_pipe", 0444, d_cpu, +				tr, cpu, &tracing_pipe_fops);  	
/* per cpu trace */ -	trace_create_file("trace", 0644, d_cpu, -			(void *)&data->trace_cpu, &tracing_fops); +	trace_create_cpu_file("trace", 0644, d_cpu, +				tr, cpu, &tracing_fops); -	trace_create_file("trace_pipe_raw", 0444, d_cpu, -			(void *)&data->trace_cpu, &tracing_buffers_fops); +	trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, +				tr, cpu, &tracing_buffers_fops); -	trace_create_file("stats", 0444, d_cpu, -			(void *)&data->trace_cpu, &tracing_stats_fops); +	trace_create_cpu_file("stats", 0444, d_cpu, +				tr, cpu, &tracing_stats_fops); -	trace_create_file("buffer_size_kb", 0444, d_cpu, -			(void *)&data->trace_cpu, &tracing_entries_fops); +	trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, +				tr, cpu, &tracing_entries_fops);  #ifdef CONFIG_TRACER_SNAPSHOT -	trace_create_file("snapshot", 0644, d_cpu, -			  (void *)&data->trace_cpu, &snapshot_fops); +	trace_create_cpu_file("snapshot", 0644, d_cpu, +				tr, cpu, &snapshot_fops); -	trace_create_file("snapshot_raw", 0444, d_cpu, -			(void *)&data->trace_cpu, &snapshot_raw_fops); +	trace_create_cpu_file("snapshot_raw", 0444, d_cpu, +				tr, cpu, &snapshot_raw_fops);  #endif  } @@ -5612,15 +5811,10 @@ rb_simple_read(struct file *filp, char __user *ubuf,  	       size_t cnt, loff_t *ppos)  {  	struct trace_array *tr = filp->private_data; -	struct ring_buffer *buffer = tr->trace_buffer.buffer;  	char buf[64];  	int r; -	if (buffer) -		r = ring_buffer_record_is_on(buffer); -	else -		r = 0; - +	r = tracer_tracing_is_on(tr);  	r = sprintf(buf, "%d\n", r);  	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -5642,11 +5836,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,  	if (buffer) {  		mutex_lock(&trace_types_lock);  		if (val) { -			ring_buffer_record_on(buffer); +			tracer_tracing_on(tr);  			if (tr->current_trace->start)  				tr->current_trace->start(tr);  		} else { -			ring_buffer_record_off(buffer); +			tracer_tracing_off(tr);  			if (tr->current_trace->stop)  				tr->current_trace->stop(tr);  		} @@ -5659,9 +5853,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,  }  static const struct file_operations rb_simple_fops = { -	.open		= tracing_open_generic, +	.open		= tracing_open_generic_tr,  	.read		= rb_simple_read,  	.write		= rb_simple_write, +	.release	= tracing_release_generic_tr,  	.llseek		= default_llseek,  }; @@ -5670,17 +5865,6 @@ struct dentry *trace_instance_dir;  static void  init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); -static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) -{ -	int cpu; - -	for_each_tracing_cpu(cpu) { -		memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); -		per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; -		per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; -	} -} -  static int  allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)  { @@ -5698,8 +5882,6 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size  		return -ENOMEM;  	} -	init_trace_buffers(tr, buf); -  	/* Allocate the first page for all buffers */  	set_buffer_entries(&tr->trace_buffer,  			   ring_buffer_size(tr->trace_buffer.buffer, 0)); @@ -5766,17 +5948,15 @@ static int new_instance_create(const char *name)  	if (allocate_trace_buffers(tr, trace_buf_size) < 0)  		goto out_free_tr; -	/* Holder for file callbacks */ -	tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; -	tr->trace_cpu.tr = tr; -  	tr->dir = debugfs_create_dir(name, trace_instance_dir);  	if (!tr->dir)  		
goto out_free_tr;  	ret = event_trace_add_tracer(tr->dir, tr); -	if (ret) +	if (ret) { +		debugfs_remove_recursive(tr->dir);  		goto out_free_tr; +	}  	init_tracer_debugfs(tr, tr->dir); @@ -5922,18 +6102,18 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)  			  tr, &tracing_iter_fops);  	trace_create_file("trace", 0644, d_tracer, -			(void *)&tr->trace_cpu, &tracing_fops); +			  tr, &tracing_fops);  	trace_create_file("trace_pipe", 0444, d_tracer, -			(void *)&tr->trace_cpu, &tracing_pipe_fops); +			  tr, &tracing_pipe_fops);  	trace_create_file("buffer_size_kb", 0644, d_tracer, -			(void *)&tr->trace_cpu, &tracing_entries_fops); +			  tr, &tracing_entries_fops);  	trace_create_file("buffer_total_size_kb", 0444, d_tracer,  			  tr, &tracing_total_entries_fops); -	trace_create_file("free_buffer", 0644, d_tracer, +	trace_create_file("free_buffer", 0200, d_tracer,  			  tr, &tracing_free_buffer_fops);  	trace_create_file("trace_marker", 0220, d_tracer, @@ -5943,11 +6123,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)  			  &trace_clock_fops);  	trace_create_file("tracing_on", 0644, d_tracer, -			    tr, &rb_simple_fops); +			  tr, &rb_simple_fops);  #ifdef CONFIG_TRACER_SNAPSHOT  	trace_create_file("snapshot", 0644, d_tracer, -			  (void *)&tr->trace_cpu, &snapshot_fops); +			  tr, &snapshot_fops);  #endif  	for_each_tracing_cpu(cpu) @@ -6241,10 +6421,6 @@ __init static int tracer_alloc_buffers(void)  	global_trace.flags = TRACE_ARRAY_FL_GLOBAL; -	/* Holder for file callbacks */ -	global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; -	global_trace.trace_cpu.tr = &global_trace; -  	INIT_LIST_HEAD(&global_trace.systems);  	INIT_LIST_HEAD(&global_trace.events);  	list_add(&global_trace.list, &ftrace_trace_arrays); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 20572ed88c5c..afaae41b0a02 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -130,19 +130,12 @@ enum trace_flag_type {  struct trace_array; -struct trace_cpu { -	struct trace_array	*tr; -	struct dentry		*dir; -	int			cpu; -}; -  /*   * The CPU trace array - it consists of thousands of trace entries   * plus some other descriptor data: (for example which task started   * the trace, etc.)   */  struct trace_array_cpu { -	struct trace_cpu	trace_cpu;  	atomic_t		disabled;  	void			*buffer_page;	/* ring buffer spare */ @@ -196,7 +189,6 @@ struct trace_array {  	bool			allocated_snapshot;  #endif  	int			buffer_disabled; -	struct trace_cpu	trace_cpu;	/* place holder */  #ifdef CONFIG_FTRACE_SYSCALLS  	int			sys_refcount_enter;  	int			sys_refcount_exit; @@ -214,7 +206,6 @@ struct trace_array {  	struct dentry		*event_dir;  	struct list_head	systems;  	struct list_head	events; -	struct task_struct	*waiter;  	int			ref;  }; @@ -224,6 +215,11 @@ enum {  extern struct list_head ftrace_trace_arrays; +extern struct mutex trace_types_lock; + +extern int trace_array_get(struct trace_array *tr); +extern void trace_array_put(struct trace_array *tr); +  /*   * The global tracer (top) should be the first trace array added,   * but we check the flag anyway. 
@@ -554,11 +550,6 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu);  void poll_wait_pipe(struct trace_iterator *iter); -void ftrace(struct trace_array *tr, -			    struct trace_array_cpu *data, -			    unsigned long ip, -			    unsigned long parent_ip, -			    unsigned long flags, int pc);  void tracing_sched_switch_trace(struct trace_array *tr,  				struct task_struct *prev,  				struct task_struct *next, @@ -680,6 +671,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,  					       struct trace_array *tr);  extern int trace_selftest_startup_branch(struct tracer *trace,  					 struct trace_array *tr); +/* + * Tracer data references selftest functions that only occur + * on boot up. These can be __init functions. Thus, when selftests + * are enabled, then the tracers need to reference __init functions. + */ +#define __tracer_data		__refdata +#else +/* Tracers are seldom changed. Optimize when selftests are disabled. */ +#define __tracer_data		__read_mostly  #endif /* CONFIG_FTRACE_STARTUP_TEST */  extern void *head_page(struct trace_array_cpu *data); @@ -774,6 +774,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)  extern struct list_head ftrace_pids;  #ifdef CONFIG_FUNCTION_TRACER +extern bool ftrace_filter_param __initdata;  static inline int ftrace_trace_task(struct task_struct *task)  {  	if (list_empty(&ftrace_pids))  @@ -899,12 +900,6 @@ static inline void trace_branch_disable(void)  /* set ring buffers to default size if not already done so */  int tracing_update_buffers(void); -/* trace event type bit fields, not numeric */ -enum { -	TRACE_EVENT_TYPE_PRINTF		= 1, -	TRACE_EVENT_TYPE_RAW		= 2, -}; -  struct ftrace_event_field {  	struct list_head	link;  	const char		*name; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 84b1e045faba..80c36bcf66e8 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -236,6 +236,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,  	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); +	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, +			"perf buffer not large enough")) +		return NULL; +  	pc = preempt_count();  	*rctxp = perf_swevent_get_recursion_context(); @@ -266,6 +270,10 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,  	struct pt_regs regs;  	int rctx; +	head = this_cpu_ptr(event_function.perf_events); +	if (hlist_empty(head)) +		return; +  #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \  		    sizeof(u64)) - sizeof(u32)) @@ -279,8 +287,6 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,  	entry->ip = ip;  	entry->parent_ip = parent_ip; - -	head = this_cpu_ptr(event_function.perf_events);  	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 			      1, &regs, head, NULL); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 27963e2bf4bf..29a7ebcfb426 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -41,6 +41,23 @@ static LIST_HEAD(ftrace_common_fields);  static struct kmem_cache *field_cachep;  static struct kmem_cache *file_cachep; +#define SYSTEM_FL_FREE_NAME		(1 << 31) + +static inline int system_refcount(struct event_subsystem *system) +{ +	return system->ref_count & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_inc(struct event_subsystem *system) +{ +	return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME; +} + +static int system_refcount_dec(struct event_subsystem 
*system) +{ +	return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME; +} +  /* Double loops, do not use break, only goto's work */  #define do_for_each_event_file(tr, file)			\  	list_for_each_entry(tr, &ftrace_trace_arrays, list) {	\ @@ -97,7 +114,7 @@ static int __trace_define_field(struct list_head *head, const char *type,  	field = kmem_cache_alloc(field_cachep, GFP_TRACE);  	if (!field) -		goto err; +		return -ENOMEM;  	field->name = name;  	field->type = type; @@ -114,11 +131,6 @@ static int __trace_define_field(struct list_head *head, const char *type,  	list_add(&field->link, head);  	return 0; - -err: -	kmem_cache_free(field_cachep, field); - -	return -ENOMEM;  }  int trace_define_field(struct ftrace_event_call *call, const char *type, @@ -279,9 +291,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,  			}  			call->class->reg(call, TRACE_REG_UNREGISTER, file);  		} -		/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ +		/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */  		if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)  			set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); +		else +			clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);  		break;  	case 1:  		/* @@ -349,8 +363,8 @@ static void __put_system(struct event_subsystem *system)  {  	struct event_filter *filter = system->filter; -	WARN_ON_ONCE(system->ref_count == 0); -	if (--system->ref_count) +	WARN_ON_ONCE(system_refcount(system) == 0); +	if (system_refcount_dec(system))  		return;  	list_del(&system->list); @@ -359,13 +373,15 @@ static void __put_system(struct event_subsystem *system)  		kfree(filter->filter_string);  		kfree(filter);  	} +	if (system->ref_count & SYSTEM_FL_FREE_NAME) +		kfree(system->name);  	kfree(system);  }  static void __get_system(struct event_subsystem *system)  { -	WARN_ON_ONCE(system->ref_count == 0); -	system->ref_count++; +	WARN_ON_ONCE(system_refcount(system) == 0); +	system_refcount_inc(system);  }  static void __get_system_dir(struct ftrace_subsystem_dir *dir) @@ -379,7 +395,7 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir)  {  	WARN_ON_ONCE(dir->ref_count == 0);  	/* If the subsystem is about to be freed, the dir must be too */ -	WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); +	WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1);  	__put_system(dir->subsystem);  	if (!--dir->ref_count) @@ -393,17 +409,55 @@ static void put_system(struct ftrace_subsystem_dir *dir)  	mutex_unlock(&event_mutex);  } +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ +	if (!dir) +		return; + +	if (!--dir->nr_events) { +		debugfs_remove_recursive(dir->entry); +		list_del(&dir->list); +		__put_system_dir(dir); +	} +} + +static void *event_file_data(struct file *filp) +{ +	return ACCESS_ONCE(file_inode(filp)->i_private); +} + +static void remove_event_file_dir(struct ftrace_event_file *file) +{ +	struct dentry *dir = file->dir; +	struct dentry *child; + +	if (dir) { +		spin_lock(&dir->d_lock);	/* probably unneeded */ +		list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) { +			if (child->d_inode)	/* probably unneeded */ +				child->d_inode->i_private = NULL; +		} +		spin_unlock(&dir->d_lock); + +		debugfs_remove_recursive(dir); +	} + +	list_del(&file->list); +	remove_subsystem(file->system); +	kmem_cache_free(file_cachep, file); +} +  /*   * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.   
*/ -static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, -				  const char *sub, const char *event, int set) +static int +__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match, +			      const char *sub, const char *event, int set)  {  	struct ftrace_event_file *file;  	struct ftrace_event_call *call;  	int ret = -EINVAL; -	mutex_lock(&event_mutex);  	list_for_each_entry(file, &tr->events, list) {  		call = file->event_call; @@ -429,6 +483,17 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,  		ret = 0;  	} + +	return ret; +} + +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, +				  const char *sub, const char *event, int set) +{ +	int ret; + +	mutex_lock(&event_mutex); +	ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);  	mutex_unlock(&event_mutex);  	return ret; @@ -623,18 +688,28 @@ static ssize_t  event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,  		  loff_t *ppos)  { -	struct ftrace_event_file *file = filp->private_data; -	char *buf; +	struct ftrace_event_file *file; +	unsigned long flags; +	char buf[4] = "0"; -	if (file->flags & FTRACE_EVENT_FL_ENABLED) { -		if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) -			buf = "0*\n"; -		else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) -			buf = "1*\n"; -		else -			buf = "1\n"; -	} else -		buf = "0\n"; +	mutex_lock(&event_mutex); +	file = event_file_data(filp); +	if (likely(file)) +		flags = file->flags; +	mutex_unlock(&event_mutex); + +	if (!file) +		return -ENODEV; + +	if (flags & FTRACE_EVENT_FL_ENABLED && +	    !(flags & FTRACE_EVENT_FL_SOFT_DISABLED)) +		strcpy(buf, "1"); + +	if (flags & FTRACE_EVENT_FL_SOFT_DISABLED || +	    flags & FTRACE_EVENT_FL_SOFT_MODE) +		strcat(buf, "*"); + +	strcat(buf, "\n");  	return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));  } @@ -643,13 +718,10 @@ static ssize_t  event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  		   loff_t *ppos)  { -	struct ftrace_event_file *file = filp->private_data; +	struct ftrace_event_file *file;  	unsigned long val;  	int ret; -	if (!file) -		return -EINVAL; -  	ret = kstrtoul_from_user(ubuf, cnt, 10, &val);  	if (ret)  		return ret; @@ -661,8 +733,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,  	switch (val) {  	case 0:  	case 1: +		ret = -ENODEV;  		mutex_lock(&event_mutex); -		ret = ftrace_event_enable_disable(file, val); +		file = event_file_data(filp); +		if (likely(file)) +			ret = ftrace_event_enable_disable(file, val);  		mutex_unlock(&event_mutex);  		break; @@ -769,65 +844,39 @@ enum {  static void *f_next(struct seq_file *m, void *v, loff_t *pos)  { -	struct ftrace_event_call *call = m->private; -	struct ftrace_event_field *field; +	struct ftrace_event_call *call = event_file_data(m->private);  	struct list_head *common_head = &ftrace_common_fields;  	struct list_head *head = trace_get_fields(call); +	struct list_head *node = v;  	(*pos)++;  	switch ((unsigned long)v) {  	case FORMAT_HEADER: -		if (unlikely(list_empty(common_head))) -			return NULL; - -		field = list_entry(common_head->prev, -				   struct ftrace_event_field, link); -		return field; +		node = common_head; +		break;  	case FORMAT_FIELD_SEPERATOR: -		if (unlikely(list_empty(head))) -			return NULL; - -		field = list_entry(head->prev, struct ftrace_event_field, link); -		return field; +		node = head; +		break;  	case FORMAT_PRINTFMT:  		/* all done */  		return NULL;  	} -	field = v; -	if 
(field->link.prev == common_head) +	node = node->prev; +	if (node == common_head)  		return (void *)FORMAT_FIELD_SEPERATOR; -	else if (field->link.prev == head) +	else if (node == head)  		return (void *)FORMAT_PRINTFMT; - -	field = list_entry(field->link.prev, struct ftrace_event_field, link); - -	return field; -} - -static void *f_start(struct seq_file *m, loff_t *pos) -{ -	loff_t l = 0; -	void *p; - -	/* Start by showing the header */ -	if (!*pos) -		return (void *)FORMAT_HEADER; - -	p = (void *)FORMAT_HEADER; -	do { -		p = f_next(m, p, &l); -	} while (p && l < *pos); - -	return p; +	else +		return node;  }  static int f_show(struct seq_file *m, void *v)  { -	struct ftrace_event_call *call = m->private; +	struct ftrace_event_call *call = event_file_data(m->private);  	struct ftrace_event_field *field;  	const char *array_descriptor; @@ -848,8 +897,7 @@ static int f_show(struct seq_file *m, void *v)  		return 0;  	} -	field = v; - +	field = list_entry(v, struct ftrace_event_field, link);  	/*  	 * Smartly shows the array type(except dynamic array).  	 * Normal: @@ -876,8 +924,25 @@ static int f_show(struct seq_file *m, void *v)  	return 0;  } +static void *f_start(struct seq_file *m, loff_t *pos) +{ +	void *p = (void *)FORMAT_HEADER; +	loff_t l = 0; + +	/* ->stop() is called even if ->start() fails */ +	mutex_lock(&event_mutex); +	if (!event_file_data(m->private)) +		return ERR_PTR(-ENODEV); + +	while (l < *pos && p) +		p = f_next(m, p, &l); + +	return p; +} +  static void f_stop(struct seq_file *m, void *p)  { +	mutex_unlock(&event_mutex);  }  static const struct seq_operations trace_format_seq_ops = { @@ -889,7 +954,6 @@ static const struct seq_operations trace_format_seq_ops = {  static int trace_format_open(struct inode *inode, struct file *file)  { -	struct ftrace_event_call *call = inode->i_private;  	struct seq_file *m;  	int ret; @@ -898,7 +962,7 @@ static int trace_format_open(struct inode *inode, struct file *file)  		return ret;  	m = file->private_data; -	m->private = call; +	m->private = file;  	return 0;  } @@ -906,45 +970,47 @@ static int trace_format_open(struct inode *inode, struct file *file)  static ssize_t  event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; -	struct trace_seq *s; -	int r; +	int id = (long)event_file_data(filp); +	char buf[32]; +	int len;  	if (*ppos)  		return 0; -	s = kmalloc(sizeof(*s), GFP_KERNEL); -	if (!s) -		return -ENOMEM; +	if (unlikely(!id)) +		return -ENODEV; -	trace_seq_init(s); -	trace_seq_printf(s, "%d\n", call->event.type); +	len = sprintf(buf, "%d\n", id); -	r = simple_read_from_buffer(ubuf, cnt, ppos, -				    s->buffer, s->len); -	kfree(s); -	return r; +	return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);  }  static ssize_t  event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,  		  loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; +	struct ftrace_event_call *call;  	struct trace_seq *s; -	int r; +	int r = -ENODEV;  	if (*ppos)  		return 0;  	s = kmalloc(sizeof(*s), GFP_KERNEL); +  	if (!s)  		return -ENOMEM;  	trace_seq_init(s); -	print_event_filter(call, s); -	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); +	mutex_lock(&event_mutex); +	call = event_file_data(filp); +	if (call) +		print_event_filter(call, s); +	mutex_unlock(&event_mutex); + +	if (call) +		r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);  	kfree(s); @@ -955,9 +1021,9 @@ static ssize_t  
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  		   loff_t *ppos)  { -	struct ftrace_event_call *call = filp->private_data; +	struct ftrace_event_call *call;  	char *buf; -	int err; +	int err = -ENODEV;  	if (cnt >= PAGE_SIZE)  		return -EINVAL; @@ -972,7 +1038,12 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,  	}  	buf[cnt] = '\0'; -	err = apply_event_filter(call, buf); +	mutex_lock(&event_mutex); +	call = event_file_data(filp); +	if (call) +		err = apply_event_filter(call, buf); +	mutex_unlock(&event_mutex); +  	free_page((unsigned long) buf);  	if (err < 0)  		return err; @@ -992,6 +1063,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)  	int ret;  	/* Make sure the system still exists */ +	mutex_lock(&trace_types_lock);  	mutex_lock(&event_mutex);  	list_for_each_entry(tr, &ftrace_trace_arrays, list) {  		list_for_each_entry(dir, &tr->systems, list) { @@ -1007,6 +1079,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)  	}   exit_loop:  	mutex_unlock(&event_mutex); +	mutex_unlock(&trace_types_lock);  	if (!system)  		return -ENODEV; @@ -1014,9 +1087,17 @@ static int subsystem_open(struct inode *inode, struct file *filp)  	/* Some versions of gcc think dir can be uninitialized here */  	WARN_ON(!dir); +	/* Still need to increment the ref count of the system */ +	if (trace_array_get(tr) < 0) { +		put_system(dir); +		return -ENODEV; +	} +  	ret = tracing_open_generic(inode, filp); -	if (ret < 0) +	if (ret < 0) { +		trace_array_put(tr);  		put_system(dir); +	}  	return ret;  } @@ -1027,16 +1108,23 @@ static int system_tr_open(struct inode *inode, struct file *filp)  	struct trace_array *tr = inode->i_private;  	int ret; +	if (trace_array_get(tr) < 0) +		return -ENODEV; +  	/* Make a temporary dir that has no system but points to tr */  	dir = kzalloc(sizeof(*dir), GFP_KERNEL); -	if (!dir) +	if (!dir) { +		trace_array_put(tr);  		return -ENOMEM; +	}  	dir->tr = tr;  	ret = tracing_open_generic(inode, filp); -	if (ret < 0) +	if (ret < 0) { +		trace_array_put(tr);  		kfree(dir); +	}  	filp->private_data = dir; @@ -1047,6 +1135,8 @@ static int subsystem_release(struct inode *inode, struct file *file)  {  	struct ftrace_subsystem_dir *dir = file->private_data; +	trace_array_put(dir->tr); +  	/*  	 * If dir->subsystem is NULL, then this is a temporary  	 * descriptor that was made for a trace_array to enable @@ -1143,6 +1233,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)  static int ftrace_event_avail_open(struct inode *inode, struct file *file);  static int ftrace_event_set_open(struct inode *inode, struct file *file); +static int ftrace_event_release(struct inode *inode, struct file *file);  static const struct seq_operations show_event_seq_ops = {  	.start = t_start, @@ -1170,7 +1261,7 @@ static const struct file_operations ftrace_set_event_fops = {  	.read = seq_read,  	.write = ftrace_event_write,  	.llseek = seq_lseek, -	.release = seq_release, +	.release = ftrace_event_release,  };  static const struct file_operations ftrace_enable_fops = { @@ -1188,7 +1279,6 @@ static const struct file_operations ftrace_event_format_fops = {  };  static const struct file_operations ftrace_event_id_fops = { -	.open = tracing_open_generic,  	.read = event_id_read,  	.llseek = default_llseek,  }; @@ -1247,6 +1337,15 @@ ftrace_event_open(struct inode *inode, struct file *file,  	return ret;  } +static int ftrace_event_release(struct inode *inode, struct file *file) +{ +	struct 
trace_array *tr = inode->i_private; + +	trace_array_put(tr); + +	return seq_release(inode, file); +} +  static int  ftrace_event_avail_open(struct inode *inode, struct file *file)  { @@ -1260,12 +1359,19 @@ ftrace_event_set_open(struct inode *inode, struct file *file)  {  	const struct seq_operations *seq_ops = &show_set_event_seq_ops;  	struct trace_array *tr = inode->i_private; +	int ret; + +	if (trace_array_get(tr) < 0) +		return -ENODEV;  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC))  		ftrace_clear_events(tr); -	return ftrace_event_open(inode, file, seq_ops); +	ret = ftrace_event_open(inode, file, seq_ops); +	if (ret < 0) +		trace_array_put(tr); +	return ret;  }  static struct event_subsystem * @@ -1279,7 +1385,15 @@ create_new_subsystem(const char *name)  		return NULL;  	system->ref_count = 1; -	system->name = name; + +	/* Only allocate if dynamic (kprobes and modules) */ +	if (!core_kernel_data((unsigned long)name)) { +		system->ref_count |= SYSTEM_FL_FREE_NAME; +		system->name = kstrdup(name, GFP_KERNEL); +		if (!system->name) +			goto out_free; +	} else +		system->name = name;  	system->filter = NULL; @@ -1292,6 +1406,8 @@ create_new_subsystem(const char *name)  	return system;   out_free: +	if (system->ref_count & SYSTEM_FL_FREE_NAME) +		kfree(system->name);  	kfree(system);  	return NULL;  } @@ -1410,8 +1526,8 @@ event_create_dir(struct dentry *parent,  #ifdef CONFIG_PERF_EVENTS  	if (call->event.type && call->class->reg) -		trace_create_file("id", 0444, file->dir, call, -		 		  id); +		trace_create_file("id", 0444, file->dir, +				  (void *)(long)call->event.type, id);  #endif  	/* @@ -1436,33 +1552,16 @@ event_create_dir(struct dentry *parent,  	return 0;  } -static void remove_subsystem(struct ftrace_subsystem_dir *dir) -{ -	if (!dir) -		return; - -	if (!--dir->nr_events) { -		debugfs_remove_recursive(dir->entry); -		list_del(&dir->list); -		__put_system_dir(dir); -	} -} -  static void remove_event_from_tracers(struct ftrace_event_call *call)  {  	struct ftrace_event_file *file;  	struct trace_array *tr;  	do_for_each_event_file_safe(tr, file) { -  		if (file->event_call != call)  			continue; -		list_del(&file->list); -		debugfs_remove_recursive(file->dir); -		remove_subsystem(file->system); -		kmem_cache_free(file_cachep, file); - +		remove_event_file_dir(file);  		/*  		 * The do_for_each_event_file_safe() is  		 * a double loop. After finding the call for this @@ -1591,6 +1690,7 @@ static void __add_event_to_tracers(struct ftrace_event_call *call,  int trace_add_event_call(struct ftrace_event_call *call)  {  	int ret; +	mutex_lock(&trace_types_lock);  	mutex_lock(&event_mutex);  	ret = __register_event(call, NULL); @@ -1598,11 +1698,13 @@ int trace_add_event_call(struct ftrace_event_call *call)  		__add_event_to_tracers(call, NULL);  	mutex_unlock(&event_mutex); +	mutex_unlock(&trace_types_lock);  	return ret;  }  /* - * Must be called under locking both of event_mutex and trace_event_sem. + * Must be called under locking of trace_types_lock, event_mutex and + * trace_event_sem.   
*/  static void __trace_remove_event_call(struct ftrace_event_call *call)  { @@ -1611,14 +1713,53 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)  	destroy_preds(call);  } +static int probe_remove_event_call(struct ftrace_event_call *call) +{ +	struct trace_array *tr; +	struct ftrace_event_file *file; + +#ifdef CONFIG_PERF_EVENTS +	if (call->perf_refcount) +		return -EBUSY; +#endif +	do_for_each_event_file(tr, file) { +		if (file->event_call != call) +			continue; +		/* +		 * We can't rely on ftrace_event_enable_disable(enable => 0) +		 * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress +		 * TRACE_REG_UNREGISTER. +		 */ +		if (file->flags & FTRACE_EVENT_FL_ENABLED) +			return -EBUSY; +		/* +		 * The do_for_each_event_file_safe() is +		 * a double loop. After finding the call for this +		 * trace_array, we use break to jump to the next +		 * trace_array. +		 */ +		break; +	} while_for_each_event_file(); + +	__trace_remove_event_call(call); + +	return 0; +} +  /* Remove an event_call */ -void trace_remove_event_call(struct ftrace_event_call *call) +int trace_remove_event_call(struct ftrace_event_call *call)  { +	int ret; + +	mutex_lock(&trace_types_lock);  	mutex_lock(&event_mutex);  	down_write(&trace_event_sem); -	__trace_remove_event_call(call); +	ret = probe_remove_event_call(call);  	up_write(&trace_event_sem);  	mutex_unlock(&event_mutex); +	mutex_unlock(&trace_types_lock); + +	return ret;  }  #define for_each_event(event, start, end)			\ @@ -1762,6 +1903,7 @@ static int trace_module_notify(struct notifier_block *self,  {  	struct module *mod = data; +	mutex_lock(&trace_types_lock);  	mutex_lock(&event_mutex);  	switch (val) {  	case MODULE_STATE_COMING: @@ -1772,6 +1914,7 @@ static int trace_module_notify(struct notifier_block *self,  		break;  	}  	mutex_unlock(&event_mutex); +	mutex_unlock(&trace_types_lock);  	return 0;  } @@ -2011,10 +2154,7 @@ event_enable_func(struct ftrace_hash *hash,  	int ret;  	/* hash funcs only work with set_ftrace_filter */ -	if (!enabled) -		return -EINVAL; - -	if (!param) +	if (!enabled || !param)  		return -EINVAL;  	system = strsep(&param, ":"); @@ -2188,12 +2328,8 @@ __trace_remove_event_dirs(struct trace_array *tr)  {  	struct ftrace_event_file *file, *next; -	list_for_each_entry_safe(file, next, &tr->events, list) { -		list_del(&file->list); -		debugfs_remove_recursive(file->dir); -		remove_subsystem(file->system); -		kmem_cache_free(file_cachep, file); -	} +	list_for_each_entry_safe(file, next, &tr->events, list) +		remove_event_file_dir(file);  }  static void @@ -2329,11 +2465,11 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)  int event_trace_del_tracer(struct trace_array *tr)  { -	/* Disable any running events */ -	__ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); -  	mutex_lock(&event_mutex); +	/* Disable any running events */ +	__ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0); +  	down_write(&trace_event_sem);  	__trace_remove_event_dirs(tr);  	debugfs_remove_recursive(tr->event_dir); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e1b653f7e1ca..97daa8cf958d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -44,6 +44,7 @@ enum filter_op_ids  	OP_LE,  	OP_GT,  	OP_GE, +	OP_BAND,  	OP_NONE,  	OP_OPEN_PAREN,  }; @@ -54,6 +55,7 @@ struct filter_op {  	int precedence;  }; +/* Order must be the same as enum filter_op_ids above */  static struct filter_op filter_ops[] = {  	{ OP_OR,	"||",		1 },  	{ 
OP_AND,	"&&",		2 }, @@ -64,6 +66,7 @@ static struct filter_op filter_ops[] = {  	{ OP_LE,	"<=",		5 },  	{ OP_GT,	">",		5 },  	{ OP_GE,	">=",		5 }, +	{ OP_BAND,	"&",		6 },  	{ OP_NONE,	"OP_NONE",	0 },  	{ OP_OPEN_PAREN, "(",		0 },  }; @@ -156,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event)	\  	case OP_GE:							\  		match = (*addr >= val);					\  		break;							\ +	case OP_BAND:							\ +		match = (*addr & val);					\ +		break;							\  	default:							\  		break;							\  	}								\ @@ -631,17 +637,15 @@ static void append_filter_err(struct filter_parse_state *ps,  	free_page((unsigned long) buf);  } +/* caller must hold event_mutex */  void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)  { -	struct event_filter *filter; +	struct event_filter *filter = call->filter; -	mutex_lock(&event_mutex); -	filter = call->filter;  	if (filter && filter->filter_string)  		trace_seq_printf(s, "%s\n", filter->filter_string);  	else -		trace_seq_printf(s, "none\n"); -	mutex_unlock(&event_mutex); +		trace_seq_puts(s, "none\n");  }  void print_subsystem_event_filter(struct event_subsystem *system, @@ -654,7 +658,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,  	if (filter && filter->filter_string)  		trace_seq_printf(s, "%s\n", filter->filter_string);  	else -		trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); +		trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n");  	mutex_unlock(&event_mutex);  } @@ -1835,23 +1839,22 @@ static int create_system_filter(struct event_subsystem *system,  	return err;  } +/* caller must hold event_mutex */  int apply_event_filter(struct ftrace_event_call *call, char *filter_string)  {  	struct event_filter *filter; -	int err = 0; - -	mutex_lock(&event_mutex); +	int err;  	if (!strcmp(strstrip(filter_string), "0")) {  		filter_disable(call);  		filter = call->filter;  		if (!filter) -			goto out_unlock; +			return 0;  		RCU_INIT_POINTER(call->filter, NULL);  		/* Make sure the filter is not being used */  		synchronize_sched();  		__free_filter(filter); -		goto out_unlock; +		return 0;  	}  	err = create_filter(call, filter_string, true, &filter); @@ -1878,8 +1881,6 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)  			__free_filter(tmp);  		}  	} -out_unlock: -	mutex_unlock(&event_mutex);  	return err;  } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c4d6d7191988..38fe1483c508 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -199,7 +199,7 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)  	return 0;  } -static struct tracer function_trace __read_mostly = +static struct tracer function_trace __tracer_data =  {  	.name		= "function",  	.init		= function_trace_init, @@ -290,6 +290,21 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)  		trace_dump_stack(STACK_SKIP);  } +static void +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (update_count(data)) +		ftrace_dump(DUMP_ALL); +} + +/* Only dump the current CPU buffer. 
*/ +static void +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +{ +	if (update_count(data)) +		ftrace_dump(DUMP_ORIG); +} +  static int  ftrace_probe_print(const char *name, struct seq_file *m,  		   unsigned long ip, void *data) @@ -327,6 +342,20 @@ ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,  	return ftrace_probe_print("stacktrace", m, ip, data);  } +static int +ftrace_dump_print(struct seq_file *m, unsigned long ip, +			struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("dump", m, ip, data); +} + +static int +ftrace_cpudump_print(struct seq_file *m, unsigned long ip, +			struct ftrace_probe_ops *ops, void *data) +{ +	return ftrace_probe_print("cpudump", m, ip, data); +} +  static struct ftrace_probe_ops traceon_count_probe_ops = {  	.func			= ftrace_traceon_count,  	.print			= ftrace_traceon_print, @@ -342,6 +371,16 @@ static struct ftrace_probe_ops stacktrace_count_probe_ops = {  	.print			= ftrace_stacktrace_print,  }; +static struct ftrace_probe_ops dump_probe_ops = { +	.func			= ftrace_dump_probe, +	.print			= ftrace_dump_print, +}; + +static struct ftrace_probe_ops cpudump_probe_ops = { +	.func			= ftrace_cpudump_probe, +	.print			= ftrace_cpudump_print, +}; +  static struct ftrace_probe_ops traceon_probe_ops = {  	.func			= ftrace_traceon,  	.print			= ftrace_traceon_print, @@ -425,6 +464,32 @@ ftrace_stacktrace_callback(struct ftrace_hash *hash,  					   param, enable);  } +static int +ftrace_dump_callback(struct ftrace_hash *hash, +			   char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	ops = &dump_probe_ops; + +	/* Only dump once. */ +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   "1", enable); +} + +static int +ftrace_cpudump_callback(struct ftrace_hash *hash, +			   char *glob, char *cmd, char *param, int enable) +{ +	struct ftrace_probe_ops *ops; + +	ops = &cpudump_probe_ops; + +	/* Only dump once. 
*/ +	return ftrace_trace_probe_callback(ops, hash, glob, cmd, +					   "1", enable); +} +  static struct ftrace_func_command ftrace_traceon_cmd = {  	.name			= "traceon",  	.func			= ftrace_trace_onoff_callback, @@ -440,6 +505,16 @@ static struct ftrace_func_command ftrace_stacktrace_cmd = {  	.func			= ftrace_stacktrace_callback,  }; +static struct ftrace_func_command ftrace_dump_cmd = { +	.name			= "dump", +	.func			= ftrace_dump_callback, +}; + +static struct ftrace_func_command ftrace_cpudump_cmd = { +	.name			= "cpudump", +	.func			= ftrace_cpudump_callback, +}; +  static int __init init_func_cmd_traceon(void)  {  	int ret; @@ -450,13 +525,31 @@ static int __init init_func_cmd_traceon(void)  	ret = register_ftrace_command(&ftrace_traceon_cmd);  	if (ret) -		unregister_ftrace_command(&ftrace_traceoff_cmd); +		goto out_free_traceoff;  	ret = register_ftrace_command(&ftrace_stacktrace_cmd); -	if (ret) { -		unregister_ftrace_command(&ftrace_traceoff_cmd); -		unregister_ftrace_command(&ftrace_traceon_cmd); -	} +	if (ret) +		goto out_free_traceon; + +	ret = register_ftrace_command(&ftrace_dump_cmd); +	if (ret) +		goto out_free_stacktrace; + +	ret = register_ftrace_command(&ftrace_cpudump_cmd); +	if (ret) +		goto out_free_dump; + +	return 0; + + out_free_dump: +	unregister_ftrace_command(&ftrace_dump_cmd); + out_free_stacktrace: +	unregister_ftrace_command(&ftrace_stacktrace_cmd); + out_free_traceon: +	unregister_ftrace_command(&ftrace_traceon_cmd); + out_free_traceoff: +	unregister_ftrace_command(&ftrace_traceoff_cmd); +  	return ret;  }  #else diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8388bc99f2ee..b5c09242683d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -446,7 +446,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)  	/* First spaces to align center */  	for (i = 0; i < spaces / 2; i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -457,7 +457,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)  	/* Last spaces to align center */  	for (i = 0; i < spaces - (spaces / 2); i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -503,7 +503,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)   ------------------------------------------   */ -	ret = trace_seq_printf(s, +	ret = trace_seq_puts(s,  		" ------------------------------------------\n");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -516,7 +516,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)  	if (ret == TRACE_TYPE_PARTIAL_LINE)  		return TRACE_TYPE_PARTIAL_LINE; -	ret = trace_seq_printf(s, " => "); +	ret = trace_seq_puts(s, " => ");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -524,7 +524,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)  	if (ret == TRACE_TYPE_PARTIAL_LINE)  		return TRACE_TYPE_PARTIAL_LINE; -	ret = trace_seq_printf(s, +	ret = trace_seq_puts(s,  		"\n ------------------------------------------\n\n");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -645,7 +645,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,  			ret = print_graph_proc(s, pid);  			if (ret == TRACE_TYPE_PARTIAL_LINE)  				return TRACE_TYPE_PARTIAL_LINE; -			ret = trace_seq_printf(s, " | "); +			ret = trace_seq_puts(s, " | ");  			if (!ret)  				return 
TRACE_TYPE_PARTIAL_LINE;  		} @@ -657,9 +657,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,  		return ret;  	if (type == TRACE_GRAPH_ENT) -		ret = trace_seq_printf(s, "==========>"); +		ret = trace_seq_puts(s, "==========>");  	else -		ret = trace_seq_printf(s, "<=========="); +		ret = trace_seq_puts(s, "<==========");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -668,7 +668,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,  	if (ret != TRACE_TYPE_HANDLED)  		return ret; -	ret = trace_seq_printf(s, "\n"); +	ret = trace_seq_putc(s, '\n');  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -705,13 +705,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)  		len += strlen(nsecs_str);  	} -	ret = trace_seq_printf(s, " us "); +	ret = trace_seq_puts(s, " us ");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE;  	/* Print remaining spaces to fit the row's width */  	for (i = len; i < 7; i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -731,13 +731,13 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,  	/* No real adata, just filling the column with spaces */  	switch (duration) {  	case DURATION_FILL_FULL: -		ret = trace_seq_printf(s, "              |  "); +		ret = trace_seq_puts(s, "              |  ");  		return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;  	case DURATION_FILL_START: -		ret = trace_seq_printf(s, "  "); +		ret = trace_seq_puts(s, "  ");  		return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;  	case DURATION_FILL_END: -		ret = trace_seq_printf(s, " |"); +		ret = trace_seq_puts(s, " |");  		return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;  	} @@ -745,10 +745,10 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,  	if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {  		/* Duration exceeded 100 msecs */  		if (duration > 100000ULL) -			ret = trace_seq_printf(s, "! "); +			ret = trace_seq_puts(s, "! ");  		/* Duration exceeded 10 msecs */  		else if (duration > 10000ULL) -			ret = trace_seq_printf(s, "+ "); +			ret = trace_seq_puts(s, "+ ");  	}  	/* @@ -757,7 +757,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,  	 * to fill out the space.  	 
*/  	if (ret == -1) -		ret = trace_seq_printf(s, "  "); +		ret = trace_seq_puts(s, "  ");  	/* Catching here any failure happenned above */  	if (!ret) @@ -767,7 +767,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,  	if (ret != TRACE_TYPE_HANDLED)  		return ret; -	ret = trace_seq_printf(s, "|  "); +	ret = trace_seq_puts(s, "|  ");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -817,7 +817,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,  	/* Function */  	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -858,7 +858,7 @@ print_graph_entry_nested(struct trace_iterator *iter,  	/* Function */  	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -917,7 +917,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,  		if (ret == TRACE_TYPE_PARTIAL_LINE)  			return TRACE_TYPE_PARTIAL_LINE; -		ret = trace_seq_printf(s, " | "); +		ret = trace_seq_puts(s, " | ");  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -1117,7 +1117,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,  	/* Closing brace */  	for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { -		ret = trace_seq_printf(s, " "); +		ret = trace_seq_putc(s, ' ');  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} @@ -1129,7 +1129,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,  	 * belongs to, write out the function name.  	 */  	if (func_match) { -		ret = trace_seq_printf(s, "}\n"); +		ret = trace_seq_puts(s, "}\n");  		if (!ret)  			return TRACE_TYPE_PARTIAL_LINE;  	} else { @@ -1179,13 +1179,13 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,  	/* Indentation */  	if (depth > 0)  		for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { -			ret = trace_seq_printf(s, " "); +			ret = trace_seq_putc(s, ' ');  			if (!ret)  				return TRACE_TYPE_PARTIAL_LINE;  		}  	/* The comment */ -	ret = trace_seq_printf(s, "/* "); +	ret = trace_seq_puts(s, "/* ");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -1216,7 +1216,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,  		s->len--;  	} -	ret = trace_seq_printf(s, " */\n"); +	ret = trace_seq_puts(s, " */\n");  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -1448,7 +1448,7 @@ static struct trace_event graph_trace_ret_event = {  	.funcs		= &graph_functions  }; -static struct tracer graph_trace __read_mostly = { +static struct tracer graph_trace __tracer_data = {  	.name		= "function_graph",  	.open		= graph_trace_open,  	.pipe_open	= graph_trace_open, diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b19d065a28cb..2aefbee93a6d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -373,7 +373,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)  	struct trace_array_cpu *data;  	unsigned long flags; -	if (likely(!tracer_enabled)) +	if (!tracer_enabled || !tracing_is_enabled())  		return;  	cpu = raw_smp_processor_id(); @@ -416,7 +416,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)  	else  		return; -	if (!tracer_enabled) +	if (!tracer_enabled || !tracing_is_enabled())  		return;  	data = per_cpu_ptr(tr->trace_buffer.data, cpu); diff --git a/kernel/trace/trace_kprobe.c 
b/kernel/trace/trace_kprobe.c index 9f46e98ba8f2..243f6834d026 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,12 +35,17 @@ struct trace_probe {  	const char		*symbol;	/* symbol name */  	struct ftrace_event_class	class;  	struct ftrace_event_call	call; -	struct ftrace_event_file * __rcu *files; +	struct list_head	files;  	ssize_t			size;		/* trace entry size */  	unsigned int		nr_args;  	struct probe_arg	args[];  }; +struct event_file_link { +	struct ftrace_event_file	*file; +	struct list_head		list; +}; +  #define SIZEOF_TRACE_PROBE(n)			\  	(offsetof(struct trace_probe, args) +	\  	(sizeof(struct probe_arg) * (n))) @@ -90,7 +95,7 @@ static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)  }  static int register_probe_event(struct trace_probe *tp); -static void unregister_probe_event(struct trace_probe *tp); +static int unregister_probe_event(struct trace_probe *tp);  static DEFINE_MUTEX(probe_lock);  static LIST_HEAD(probe_list); @@ -150,6 +155,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,  		goto error;  	INIT_LIST_HEAD(&tp->list); +	INIT_LIST_HEAD(&tp->files);  	return tp;  error:  	kfree(tp->call.name); @@ -183,25 +189,6 @@ static struct trace_probe *find_trace_probe(const char *event,  	return NULL;  } -static int trace_probe_nr_files(struct trace_probe *tp) -{ -	struct ftrace_event_file **file; -	int ret = 0; - -	/* -	 * Since all tp->files updater is protected by probe_enable_lock, -	 * we don't need to lock an rcu_read_lock. -	 */ -	file = rcu_dereference_raw(tp->files); -	if (file) -		while (*(file++)) -			ret++; - -	return ret; -} - -static DEFINE_MUTEX(probe_enable_lock); -  /*   * Enable trace_probe   * if the file is NULL, enable "perf" handler, or enable "trace" handler. 
@@ -211,67 +198,42 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)  {  	int ret = 0; -	mutex_lock(&probe_enable_lock); -  	if (file) { -		struct ftrace_event_file **new, **old; -		int n = trace_probe_nr_files(tp); - -		old = rcu_dereference_raw(tp->files); -		/* 1 is for new one and 1 is for stopper */ -		new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *), -			      GFP_KERNEL); -		if (!new) { +		struct event_file_link *link; + +		link = kmalloc(sizeof(*link), GFP_KERNEL); +		if (!link) {  			ret = -ENOMEM; -			goto out_unlock; +			goto out;  		} -		memcpy(new, old, n * sizeof(struct ftrace_event_file *)); -		new[n] = file; -		/* The last one keeps a NULL */ -		rcu_assign_pointer(tp->files, new); -		tp->flags |= TP_FLAG_TRACE; +		link->file = file; +		list_add_tail_rcu(&link->list, &tp->files); -		if (old) { -			/* Make sure the probe is done with old files */ -			synchronize_sched(); -			kfree(old); -		} +		tp->flags |= TP_FLAG_TRACE;  	} else  		tp->flags |= TP_FLAG_PROFILE; -	if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && -	    !trace_probe_has_gone(tp)) { +	if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) {  		if (trace_probe_is_return(tp))  			ret = enable_kretprobe(&tp->rp);  		else  			ret = enable_kprobe(&tp->rp.kp);  	} - - out_unlock: -	mutex_unlock(&probe_enable_lock); - + out:  	return ret;  } -static int -trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) +static struct event_file_link * +find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)  { -	struct ftrace_event_file **files; -	int i; +	struct event_file_link *link; -	/* -	 * Since all tp->files updater is protected by probe_enable_lock, -	 * we don't need to lock an rcu_read_lock. 
-	 */ -	files = rcu_dereference_raw(tp->files); -	if (files) { -		for (i = 0; files[i]; i++) -			if (files[i] == file) -				return i; -	} +	list_for_each_entry(link, &tp->files, list) +		if (link->file == file) +			return link; -	return -1; +	return NULL;  }  /* @@ -281,43 +243,23 @@ trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file)  static int  disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)  { +	struct event_file_link *link = NULL; +	int wait = 0;  	int ret = 0; -	mutex_lock(&probe_enable_lock); -  	if (file) { -		struct ftrace_event_file **new, **old; -		int n = trace_probe_nr_files(tp); -		int i, j; - -		old = rcu_dereference_raw(tp->files); -		if (n == 0 || trace_probe_file_index(tp, file) < 0) { +		link = find_event_file_link(tp, file); +		if (!link) {  			ret = -EINVAL; -			goto out_unlock; +			goto out;  		} -		if (n == 1) {	/* Remove the last file */ -			tp->flags &= ~TP_FLAG_TRACE; -			new = NULL; -		} else { -			new = kzalloc(n * sizeof(struct ftrace_event_file *), -				      GFP_KERNEL); -			if (!new) { -				ret = -ENOMEM; -				goto out_unlock; -			} - -			/* This copy & check loop copies the NULL stopper too */ -			for (i = 0, j = 0; j < n && i < n + 1; i++) -				if (old[i] != file) -					new[j++] = old[i]; -		} - -		rcu_assign_pointer(tp->files, new); +		list_del_rcu(&link->list); +		wait = 1; +		if (!list_empty(&tp->files)) +			goto out; -		/* Make sure the probe is done with old files */ -		synchronize_sched(); -		kfree(old); +		tp->flags &= ~TP_FLAG_TRACE;  	} else  		tp->flags &= ~TP_FLAG_PROFILE; @@ -326,10 +268,21 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)  			disable_kretprobe(&tp->rp);  		else  			disable_kprobe(&tp->rp.kp); +		wait = 1; +	} + out: +	if (wait) { +		/* +		 * Synchronize with kprobe_trace_func/kretprobe_trace_func +		 * to ensure disabled (all running handlers are finished). +		 * This is not only for kfree(), but also the caller, +		 * trace_remove_event_call() supposes it for releasing +		 * event_call related objects, which will be accessed in +		 * the kprobe_trace_func/kretprobe_trace_func. +		 */ +		synchronize_sched(); +		kfree(link);	/* Ignored if link == NULL */  	} - - out_unlock: -	mutex_unlock(&probe_enable_lock);  	return ret;  } @@ -398,9 +351,12 @@ static int unregister_trace_probe(struct trace_probe *tp)  	if (trace_probe_is_enabled(tp))  		return -EBUSY; +	/* Will fail if probe is being used by ftrace or perf */ +	if (unregister_probe_event(tp)) +		return -EBUSY; +  	__unregister_trace_probe(tp);  	list_del(&tp->list); -	unregister_probe_event(tp);  	return 0;  } @@ -679,7 +635,9 @@ static int release_all_trace_probes(void)  	/* TODO: Use batch unregistration */  	while (!list_empty(&probe_list)) {  		tp = list_entry(probe_list.next, struct trace_probe, list); -		unregister_trace_probe(tp); +		ret = unregister_trace_probe(tp); +		if (ret) +			goto end;  		free_trace_probe(tp);  	} @@ -885,20 +843,10 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,  static __kprobes void  kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)  { -	/* -	 * Note: preempt is already disabled around the kprobe handler. -	 * However, we still need an smp_read_barrier_depends() corresponding -	 * to smp_wmb() in rcu_assign_pointer() to access the pointer. 
-	 */ -	struct ftrace_event_file **file = rcu_dereference_raw(tp->files); - -	if (unlikely(!file)) -		return; +	struct event_file_link *link; -	while (*file) { -		__kprobe_trace_func(tp, regs, *file); -		file++; -	} +	list_for_each_entry_rcu(link, &tp->files, list) +		__kprobe_trace_func(tp, regs, link->file);  }  /* Kretprobe handler */ @@ -945,20 +893,10 @@ static __kprobes void  kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,  		     struct pt_regs *regs)  { -	/* -	 * Note: preempt is already disabled around the kprobe handler. -	 * However, we still need an smp_read_barrier_depends() corresponding -	 * to smp_wmb() in rcu_assign_pointer() to access the pointer. -	 */ -	struct ftrace_event_file **file = rcu_dereference_raw(tp->files); - -	if (unlikely(!file)) -		return; +	struct event_file_link *link; -	while (*file) { -		__kretprobe_trace_func(tp, ri, regs, *file); -		file++; -	} +	list_for_each_entry_rcu(link, &tp->files, list) +		__kretprobe_trace_func(tp, ri, regs, link->file);  }  /* Event entry printers */ @@ -1157,13 +1095,14 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)  	int size, __size, dsize;  	int rctx; +	head = this_cpu_ptr(call->perf_events); +	if (hlist_empty(head)) +		return; +  	dsize = __get_data_size(tp, regs);  	__size = sizeof(*entry) + tp->size + dsize;  	size = ALIGN(__size + sizeof(u32), sizeof(u64));  	size -= sizeof(u32); -	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, -		     "profile buffer not large enough")) -		return;  	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);  	if (!entry) @@ -1172,10 +1111,7 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)  	entry->ip = (unsigned long)tp->rp.kp.addr;  	memset(&entry[1], 0, dsize);  	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - -	head = this_cpu_ptr(call->perf_events); -	perf_trace_buf_submit(entry, size, rctx, -					entry->ip, 1, regs, head, NULL); +	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);  }  /* Kretprobe profile handler */ @@ -1189,13 +1125,14 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,  	int size, __size, dsize;  	int rctx; +	head = this_cpu_ptr(call->perf_events); +	if (hlist_empty(head)) +		return; +  	dsize = __get_data_size(tp, regs);  	__size = sizeof(*entry) + tp->size + dsize;  	size = ALIGN(__size + sizeof(u32), sizeof(u64));  	size -= sizeof(u32); -	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, -		     "profile buffer not large enough")) -		return;  	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);  	if (!entry) @@ -1204,13 +1141,16 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,  	entry->func = (unsigned long)tp->rp.kp.addr;  	entry->ret_ip = (unsigned long)ri->ret_addr;  	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - -	head = this_cpu_ptr(call->perf_events); -	perf_trace_buf_submit(entry, size, rctx, -					entry->ret_ip, 1, regs, head, NULL); +	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);  }  #endif	/* CONFIG_PERF_EVENTS */ +/* + * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. + * + * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe + * lockless, but we can't race with this __init function. 
+ */  static __kprobes  int kprobe_register(struct ftrace_event_call *event,  		    enum trace_reg type, void *data) @@ -1312,11 +1252,15 @@ static int register_probe_event(struct trace_probe *tp)  	return ret;  } -static void unregister_probe_event(struct trace_probe *tp) +static int unregister_probe_event(struct trace_probe *tp)  { +	int ret; +  	/* tp->event is unregistered in trace_remove_event_call() */ -	trace_remove_event_call(&tp->call); -	kfree(tp->call.print_fmt); +	ret = trace_remove_event_call(&tp->call); +	if (!ret) +		kfree(tp->call.print_fmt); +	return ret;  }  /* Make a debugfs interface for controlling probe points */ @@ -1376,6 +1320,10 @@ find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr)  	return NULL;  } +/* + * Nobody but us can call enable_trace_probe/disable_trace_probe at this + * stage, we can do this lockless. + */  static __init int kprobe_trace_self_tests_init(void)  {  	int ret, warn = 0; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index a5e8f4878bfa..b3dcfb2f0fef 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)  	if (drv)  		ret += trace_seq_printf(s, " %s\n", drv->name);  	else -		ret += trace_seq_printf(s, " \n"); +		ret += trace_seq_puts(s, " \n");  	return ret;  } @@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter)  	struct header_iter *hiter;  	struct trace_seq *s = &iter->seq; -	trace_seq_printf(s, "VERSION 20070824\n"); +	trace_seq_puts(s, "VERSION 20070824\n");  	hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);  	if (!hiter) @@ -209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)  			(rw->value >> 0) & 0xff, rw->pc, 0);  		break;  	default: -		ret = trace_seq_printf(s, "rw what?\n"); +		ret = trace_seq_puts(s, "rw what?\n");  		break;  	}  	if (ret) @@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)  			secs, usec_rem, m->map_id, 0UL, 0);  		break;  	default: -		ret = trace_seq_printf(s, "map what?\n"); +		ret = trace_seq_puts(s, "map what?\n");  		break;  	}  	if (ret) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index bb922d9ee51b..34e7cbac0c9c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -78,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)  	trace_assign_type(field, entry); -	ret = trace_seq_printf(s, "%s", field->buf); +	ret = trace_seq_puts(s, field->buf);  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -558,14 +558,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,  			if (ret)  				ret = trace_seq_puts(s, "??");  			if (ret) -				ret = trace_seq_puts(s, "\n"); +				ret = trace_seq_putc(s, '\n');  			continue;  		}  		if (!ret)  			break;  		if (ret)  			ret = seq_print_user_ip(s, mm, ip, sym_flags); -		ret = trace_seq_puts(s, "\n"); +		ret = trace_seq_putc(s, '\n');  	}  	if (mm) @@ -579,7 +579,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)  	int ret;  	if (!ip) -		return trace_seq_printf(s, "0"); +		return trace_seq_putc(s, '0');  	if (sym_flags & TRACE_ITER_SYM_OFFSET)  		ret = seq_print_sym_offset(s, "%s", ip); @@ -964,14 +964,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,  		goto partial;  	if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { -		if 
(!trace_seq_printf(s, " <-")) +		if (!trace_seq_puts(s, " <-"))  			goto partial;  		if (!seq_print_ip_sym(s,  				      field->parent_ip,  				      flags))  			goto partial;  	} -	if (!trace_seq_printf(s, "\n")) +	if (!trace_seq_putc(s, '\n'))  		goto partial;  	return TRACE_TYPE_HANDLED; @@ -1210,7 +1210,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,  		if (!seq_print_ip_sym(s, *p, flags))  			goto partial; -		if (!trace_seq_puts(s, "\n")) +		if (!trace_seq_putc(s, '\n'))  			goto partial;  	} diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 2901e3b88590..a7329b7902f8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -640,13 +640,20 @@ out:   * Enable ftrace, sleep 1/10 second, and then read the trace   * buffer to see if all is in order.   */ -int +__init int  trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)  {  	int save_ftrace_enabled = ftrace_enabled;  	unsigned long count;  	int ret; +#ifdef CONFIG_DYNAMIC_FTRACE +	if (ftrace_filter_param) { +		printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); +		return 0; +	} +#endif +  	/* make sure msleep has been recorded */  	msleep(1); @@ -727,13 +734,20 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)   * Pretty much the same than for the function tracer from which the selftest   * has been borrowed.   */ -int +__init int  trace_selftest_startup_function_graph(struct tracer *trace,  					struct trace_array *tr)  {  	int ret;  	unsigned long count; +#ifdef CONFIG_DYNAMIC_FTRACE +	if (ftrace_filter_param) { +		printk(KERN_CONT " ... kernel command line filter set: force PASS ... "); +		return 0; +	} +#endif +  	/*  	 * Simulate the init() callback but we attach a watchdog callback  	 * to detect and recover from possible hangs diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8f2ac73c7a5f..8fd03657bc7d 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -175,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,  	entry = syscall_nr_to_meta(syscall);  	if (!entry) { -		trace_seq_printf(s, "\n"); +		trace_seq_putc(s, '\n');  		return TRACE_TYPE_HANDLED;  	} @@ -306,6 +306,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  	struct syscall_metadata *sys_data;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer; +	unsigned long irq_flags; +	int pc;  	int syscall_nr;  	int size; @@ -321,9 +323,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; +	local_save_flags(irq_flags); +	pc = preempt_count(); +  	buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, -			sys_data->enter_event->event.type, size, 0, 0); +			sys_data->enter_event->event.type, size, irq_flags, pc);  	if (!event)  		return; @@ -333,7 +338,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)  	if (!filter_current_check_discard(buffer, sys_data->enter_event,  					  entry, event)) -		trace_current_buffer_unlock_commit(buffer, event, 0, 0); +		trace_current_buffer_unlock_commit(buffer, event, +						   irq_flags, pc);  }  static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) @@ -343,6 +349,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)  	struct syscall_metadata *sys_data;  	struct 
ring_buffer_event *event;  	struct ring_buffer *buffer; +	unsigned long irq_flags; +	int pc;  	int syscall_nr;  	syscall_nr = trace_get_syscall_nr(current, regs); @@ -355,9 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)  	if (!sys_data)  		return; +	local_save_flags(irq_flags); +	pc = preempt_count(); +  	buffer = tr->trace_buffer.buffer;  	event = trace_buffer_lock_reserve(buffer, -			sys_data->exit_event->event.type, sizeof(*entry), 0, 0); +			sys_data->exit_event->event.type, sizeof(*entry), +			irq_flags, pc);  	if (!event)  		return; @@ -367,7 +379,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)  	if (!filter_current_check_discard(buffer, sys_data->exit_event,  					  entry, event)) -		trace_current_buffer_unlock_commit(buffer, event, 0, 0); +		trace_current_buffer_unlock_commit(buffer, event, +						   irq_flags, pc);  }  static int reg_event_syscall_enter(struct ftrace_event_file *file, @@ -553,15 +566,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	if (!sys_data)  		return; +	head = this_cpu_ptr(sys_data->enter_event->perf_events); +	if (hlist_empty(head)) +		return; +  	/* get the size after alignment with the u32 buffer size field */  	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);  	size = ALIGN(size + sizeof(u32), sizeof(u64));  	size -= sizeof(u32); -	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, -		      "perf buffer not large enough")) -		return; -  	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,  				sys_data->enter_event->event.type, regs, &rctx);  	if (!rec) @@ -570,8 +583,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)  	rec->nr = syscall_nr;  	syscall_get_arguments(current, regs, 0, sys_data->nb_args,  			       (unsigned long *)&rec->args); - -	head = this_cpu_ptr(sys_data->enter_event->perf_events);  	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);  } @@ -629,18 +640,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	if (!sys_data)  		return; +	head = this_cpu_ptr(sys_data->exit_event->perf_events); +	if (hlist_empty(head)) +		return; +  	/* We can probably do that at build time */  	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));  	size -= sizeof(u32); -	/* -	 * Impossible, but be paranoid with the future -	 * How to put this check outside runtime? 
-	 */ -	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, -		"exit event has grown above perf buffer size")) -		return; -  	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,  				sys_data->exit_event->event.type, regs, &rctx);  	if (!rec) @@ -648,8 +655,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)  	rec->nr = syscall_nr;  	rec->ret = syscall_get_return_value(current, regs); - -	head = this_cpu_ptr(sys_data->exit_event->perf_events);  	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);  } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 32494fb0ee64..272261b5f94f 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -70,7 +70,7 @@ struct trace_uprobe {  	(sizeof(struct probe_arg) * (n)))  static int register_uprobe_event(struct trace_uprobe *tu); -static void unregister_uprobe_event(struct trace_uprobe *tu); +static int unregister_uprobe_event(struct trace_uprobe *tu);  static DEFINE_MUTEX(uprobe_lock);  static LIST_HEAD(uprobe_list); @@ -164,11 +164,17 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou  }  /* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */ -static void unregister_trace_uprobe(struct trace_uprobe *tu) +static int unregister_trace_uprobe(struct trace_uprobe *tu)  { +	int ret; + +	ret = unregister_uprobe_event(tu); +	if (ret) +		return ret; +  	list_del(&tu->list); -	unregister_uprobe_event(tu);  	free_trace_uprobe(tu); +	return 0;  }  /* Register a trace_uprobe and probe_event */ @@ -181,9 +187,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu)  	/* register as an event */  	old_tp = find_probe_event(tu->call.name, tu->call.class->system); -	if (old_tp) +	if (old_tp) {  		/* delete old event */ -		unregister_trace_uprobe(old_tp); +		ret = unregister_trace_uprobe(old_tp); +		if (ret) +			goto end; +	}  	ret = register_uprobe_event(tu);  	if (ret) { @@ -256,6 +265,8 @@ static int create_trace_uprobe(int argc, char **argv)  		group = UPROBE_EVENT_SYSTEM;  	if (is_delete) { +		int ret; +  		if (!event) {  			pr_info("Delete command needs an event name.\n");  			return -EINVAL; @@ -269,9 +280,9 @@ static int create_trace_uprobe(int argc, char **argv)  			return -ENOENT;  		}  		/* delete an event */ -		unregister_trace_uprobe(tu); +		ret = unregister_trace_uprobe(tu);  		mutex_unlock(&uprobe_lock); -		return 0; +		return ret;  	}  	if (argc < 2) { @@ -283,8 +294,10 @@ static int create_trace_uprobe(int argc, char **argv)  		return -EINVAL;  	}  	arg = strchr(argv[1], ':'); -	if (!arg) +	if (!arg) { +		ret = -EINVAL;  		goto fail_address_parse; +	}  	*arg++ = '\0';  	filename = argv[1]; @@ -406,16 +419,20 @@ fail_address_parse:  	return ret;  } -static void cleanup_all_probes(void) +static int cleanup_all_probes(void)  {  	struct trace_uprobe *tu; +	int ret = 0;  	mutex_lock(&uprobe_lock);  	while (!list_empty(&uprobe_list)) {  		tu = list_entry(uprobe_list.next, struct trace_uprobe, list); -		unregister_trace_uprobe(tu); +		ret = unregister_trace_uprobe(tu); +		if (ret) +			break;  	}  	mutex_unlock(&uprobe_lock); +	return ret;  }  /* Probes listing interfaces */ @@ -460,8 +477,13 @@ static const struct seq_operations probes_seq_op = {  static int probes_open(struct inode *inode, struct file *file)  { -	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) -		cleanup_all_probes(); +	int ret; + +	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { +		ret = cleanup_all_probes(); +		
if (ret) +			return ret; +	}  	return seq_open(file, &probes_seq_op);  } @@ -816,8 +838,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu,  	size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));  	size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); -	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) -		return;  	preempt_disable();  	head = this_cpu_ptr(call->perf_events); @@ -968,12 +988,17 @@ static int register_uprobe_event(struct trace_uprobe *tu)  	return ret;  } -static void unregister_uprobe_event(struct trace_uprobe *tu) +static int unregister_uprobe_event(struct trace_uprobe *tu)  { +	int ret; +  	/* tu->event is unregistered in trace_remove_event_call() */ -	trace_remove_event_call(&tu->call); +	ret = trace_remove_event_call(&tu->call); +	if (ret) +		return ret;  	kfree(tu->call.print_fmt);  	tu->call.print_fmt = NULL; +	return 0;  }  /* Make a trace interface for controling probe points */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d8c30db06c5b..9064b919a406 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -62,6 +62,9 @@ int create_user_ns(struct cred *new)  	kgid_t group = new->egid;  	int ret; +	if (parent_ns->level > 32) +		return -EUSERS; +  	/*  	 * Verify that we can not violate the policy of which files  	 * may be accessed that is specified by the root directory, @@ -92,6 +95,7 @@ int create_user_ns(struct cred *new)  	atomic_set(&ns->count, 1);  	/* Leave the new->user_ns reference with the new user namespace. */  	ns->parent = parent_ns; +	ns->level = parent_ns->level + 1;  	ns->owner = owner;  	ns->group = group; @@ -105,16 +109,21 @@ int create_user_ns(struct cred *new)  int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)  {  	struct cred *cred; +	int err = -ENOMEM;  	if (!(unshare_flags & CLONE_NEWUSER))  		return 0;  	cred = prepare_creds(); -	if (!cred) -		return -ENOMEM; +	if (cred) { +		err = create_user_ns(cred); +		if (err) +			put_cred(cred); +		else +			*new_cred = cred; +	} -	*new_cred = cred; -	return create_user_ns(cred); +	return err;  }  void free_user_ns(struct user_namespace *ns) diff --git a/kernel/wait.c b/kernel/wait.c index ce0daa320a26..dec68bd4e9d8 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -333,7 +333,8 @@ int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,  		prepare_to_wait(wq, &q->wait, mode);  		val = q->key.flags;  		if (atomic_read(val) == 0) -			ret = (*action)(val); +			break; +		ret = (*action)(val);  	} while (!ret && atomic_read(val) != 0);  	finish_wait(wq, &q->wait);  	return ret; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 05039e348f07..1241d8c91d5e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,9 +29,9 @@  #include <linux/kvm_para.h>  #include <linux/perf_event.h> -int watchdog_enabled = 1; +int watchdog_user_enabled = 1;  int __read_mostly watchdog_thresh = 10; -static int __read_mostly watchdog_disabled; +static int __read_mostly watchdog_running;  static u64 __read_mostly sample_period;  static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -63,7 +63,7 @@ static int __init hardlockup_panic_setup(char *str)  	else if (!strncmp(str, "nopanic", 7))  		hardlockup_panic = 0;  	else if (!strncmp(str, "0", 1)) -		watchdog_enabled = 0; +		watchdog_user_enabled = 0;  	return 1;  }  __setup("nmi_watchdog=", hardlockup_panic_setup); @@ -82,7 +82,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);  static int __init nowatchdog_setup(char *str)  { -	
watchdog_enabled = 0; +	watchdog_user_enabled = 0;  	return 1;  }  __setup("nowatchdog", nowatchdog_setup); @@ -90,7 +90,7 @@ __setup("nowatchdog", nowatchdog_setup);  /* deprecated */  static int __init nosoftlockup_setup(char *str)  { -	watchdog_enabled = 0; +	watchdog_user_enabled = 0;  	return 1;  }  __setup("nosoftlockup", nosoftlockup_setup); @@ -158,7 +158,7 @@ void touch_all_softlockup_watchdogs(void)  #ifdef CONFIG_HARDLOCKUP_DETECTOR  void touch_nmi_watchdog(void)  { -	if (watchdog_enabled) { +	if (watchdog_user_enabled) {  		unsigned cpu;  		for_each_present_cpu(cpu) { @@ -347,11 +347,6 @@ static void watchdog_enable(unsigned int cpu)  	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);  	hrtimer->function = watchdog_timer_fn; -	if (!watchdog_enabled) { -		kthread_park(current); -		return; -	} -  	/* Enable the perf event */  	watchdog_nmi_enable(cpu); @@ -374,6 +369,11 @@ static void watchdog_disable(unsigned int cpu)  	watchdog_nmi_disable(cpu);  } +static void watchdog_cleanup(unsigned int cpu, bool online) +{ +	watchdog_disable(cpu); +} +  static int watchdog_should_run(unsigned int cpu)  {  	return __this_cpu_read(hrtimer_interrupts) != @@ -475,28 +475,40 @@ static int watchdog_nmi_enable(unsigned int cpu) { return 0; }  static void watchdog_nmi_disable(unsigned int cpu) { return; }  #endif /* CONFIG_HARDLOCKUP_DETECTOR */ -/* prepare/enable/disable routines */ -/* sysctl functions */ -#ifdef CONFIG_SYSCTL -static void watchdog_enable_all_cpus(void) +static struct smp_hotplug_thread watchdog_threads = { +	.store			= &softlockup_watchdog, +	.thread_should_run	= watchdog_should_run, +	.thread_fn		= watchdog, +	.thread_comm		= "watchdog/%u", +	.setup			= watchdog_enable, +	.cleanup		= watchdog_cleanup, +	.park			= watchdog_disable, +	.unpark			= watchdog_enable, +}; + +static int watchdog_enable_all_cpus(void)  { -	unsigned int cpu; +	int err = 0; -	if (watchdog_disabled) { -		watchdog_disabled = 0; -		for_each_online_cpu(cpu) -			kthread_unpark(per_cpu(softlockup_watchdog, cpu)); +	if (!watchdog_running) { +		err = smpboot_register_percpu_thread(&watchdog_threads); +		if (err) +			pr_err("Failed to create watchdog threads, disabled\n"); +		else +			watchdog_running = 1;  	} + +	return err;  } +/* prepare/enable/disable routines */ +/* sysctl functions */ +#ifdef CONFIG_SYSCTL  static void watchdog_disable_all_cpus(void)  { -	unsigned int cpu; - -	if (!watchdog_disabled) { -		watchdog_disabled = 1; -		for_each_online_cpu(cpu) -			kthread_park(per_cpu(softlockup_watchdog, cpu)); +	if (watchdog_running) { +		watchdog_running = 0; +		smpboot_unregister_percpu_thread(&watchdog_threads);  	}  } @@ -507,45 +519,48 @@ static void watchdog_disable_all_cpus(void)  int proc_dowatchdog(struct ctl_table *table, int write,  		    void __user *buffer, size_t *lenp, loff_t *ppos)  { -	int ret; +	int err, old_thresh, old_enabled; -	if (watchdog_disabled < 0) -		return -ENODEV; +	old_thresh = ACCESS_ONCE(watchdog_thresh); +	old_enabled = ACCESS_ONCE(watchdog_user_enabled); -	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); -	if (ret || !write) -		return ret; +	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); +	if (err || !write) +		return err;  	set_sample_period();  	/*  	 * Watchdog threads shouldn't be enabled if they are -	 * disabled. The 'watchdog_disabled' variable check in +	 * disabled. The 'watchdog_running' variable check in  	 * watchdog_*_all_cpus() function takes care of this.  	 
*/ -	if (watchdog_enabled && watchdog_thresh) -		watchdog_enable_all_cpus(); +	if (watchdog_user_enabled && watchdog_thresh) +		err = watchdog_enable_all_cpus();  	else  		watchdog_disable_all_cpus(); -	return ret; +	/* Restore old values on failure */ +	if (err) { +		watchdog_thresh = old_thresh; +		watchdog_user_enabled = old_enabled; +	} + +	return err;  }  #endif /* CONFIG_SYSCTL */ -static struct smp_hotplug_thread watchdog_threads = { -	.store			= &softlockup_watchdog, -	.thread_should_run	= watchdog_should_run, -	.thread_fn		= watchdog, -	.thread_comm		= "watchdog/%u", -	.setup			= watchdog_enable, -	.park			= watchdog_disable, -	.unpark			= watchdog_enable, -}; -  void __init lockup_detector_init(void)  {  	set_sample_period(); -	if (smpboot_register_percpu_thread(&watchdog_threads)) { -		pr_err("Failed to create watchdog threads, disabled\n"); -		watchdog_disabled = -ENODEV; + +#ifdef CONFIG_NO_HZ_FULL +	if (watchdog_user_enabled) { +		watchdog_user_enabled = 0; +		pr_warning("Disabled lockup detectors by default for full dynticks\n"); +		pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n");  	} +#endif + +	if (watchdog_user_enabled) +		watchdog_enable_all_cpus();  } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4a0c3c..7f5d4be22034 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2817,6 +2817,19 @@ already_gone:  	return false;  } +static bool __flush_work(struct work_struct *work) +{ +	struct wq_barrier barr; + +	if (start_flush_work(work, &barr)) { +		wait_for_completion(&barr.done); +		destroy_work_on_stack(&barr.work); +		return true; +	} else { +		return false; +	} +} +  /**   * flush_work - wait for a work to finish executing the last queueing instance   * @work: the work to flush @@ -2830,18 +2843,10 @@ already_gone:   */  bool flush_work(struct work_struct *work)  { -	struct wq_barrier barr; -  	lock_map_acquire(&work->lockdep_map);  	lock_map_release(&work->lockdep_map); -	if (start_flush_work(work, &barr)) { -		wait_for_completion(&barr.done); -		destroy_work_on_stack(&barr.work); -		return true; -	} else { -		return false; -	} +	return __flush_work(work);  }  EXPORT_SYMBOL_GPL(flush_work); @@ -3411,6 +3416,12 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,  {  	to->nice = from->nice;  	cpumask_copy(to->cpumask, from->cpumask); +	/* +	 * Unlike hash and equality test, this function doesn't ignore +	 * ->no_numa as it is used for both pool and wq attrs.  Instead, +	 * get_unbound_pool() explicitly clears ->no_numa after copying. +	 */ +	to->no_numa = from->no_numa;  }  /* hash value of the content of @attr */ @@ -3578,6 +3589,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)  	lockdep_set_subclass(&pool->lock, 1);	/* see put_pwq() */  	copy_workqueue_attrs(pool->attrs, attrs); +	/* +	 * no_numa isn't a worker_pool attribute, always clear it.  See +	 * 'struct workqueue_attrs' comments for detail. +	 */ +	pool->attrs->no_numa = false; +  	/* if cpumask is contained inside a NUMA node, we belong to that node */  	if (wq_numa_enabled) {  		for_each_node(node) { @@ -4644,7 +4661,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)   * Workqueues should be brought up before normal priority CPU notifiers.   * This will be registered high priority CPU notifier.   
*/ -static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, +static int workqueue_cpu_up_callback(struct notifier_block *nfb,  					       unsigned long action,  					       void *hcpu)  { @@ -4697,7 +4714,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,   * Workqueues should be brought down after normal priority CPU notifiers.   * This will be registered as low priority CPU notifier.   */ -static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, +static int workqueue_cpu_down_callback(struct notifier_block *nfb,  						 unsigned long action,  						 void *hcpu)  { @@ -4756,7 +4773,14 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)  	INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);  	schedule_work_on(cpu, &wfc.work); -	flush_work(&wfc.work); + +	/* +	 * The work item is on-stack and can't lead to deadlock through +	 * flushing.  Use __flush_work() to avoid spurious lockdep warnings +	 * when work_on_cpu()s are nested. +	 */ +	__flush_work(&wfc.work); +  	return wfc.ret;  }  EXPORT_SYMBOL_GPL(work_on_cpu);  | 
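For illustration only (not part of the patch above): the reworked proc_dowatchdog() path snapshots the old sysctl values, tries to apply the new ones, and writes the saved values back if re-enabling the watchdog threads fails. The minimal userspace C sketch below mirrors that snapshot-and-rollback shape; the *_demo variables and the enable_all_cpus_demo()/disable_all_cpus_demo() helpers are hypothetical stand-ins, not kernel APIs.

/*
 * Standalone userspace sketch (not kernel code) of the snapshot-and-rollback
 * pattern used by the reworked proc_dowatchdog(): remember the old knob
 * values, try to apply the new ones, and restore the old values if the
 * enable path fails.  All names are hypothetical stand-ins.
 */
#include <stdio.h>
#include <errno.h>

static int watchdog_user_enabled_demo = 1;	/* stands in for watchdog_user_enabled */
static int watchdog_thresh_demo = 10;		/* stands in for watchdog_thresh */

/* pretend to start the per-CPU watchdog threads; forced to fail here */
static int enable_all_cpus_demo(void)
{
	return -ENOMEM;
}

static void disable_all_cpus_demo(void)
{
}

static int apply_watchdog_settings(int new_enabled, int new_thresh)
{
	int old_enabled = watchdog_user_enabled_demo;
	int old_thresh  = watchdog_thresh_demo;
	int err = 0;

	watchdog_user_enabled_demo = new_enabled;
	watchdog_thresh_demo = new_thresh;

	if (watchdog_user_enabled_demo && watchdog_thresh_demo)
		err = enable_all_cpus_demo();
	else
		disable_all_cpus_demo();

	/* restore the old values on failure, as proc_dowatchdog() now does */
	if (err) {
		watchdog_user_enabled_demo = old_enabled;
		watchdog_thresh_demo = old_thresh;
	}
	return err;
}

int main(void)
{
	int err = apply_watchdog_settings(1, 20);

	printf("apply returned %d, enabled=%d thresh=%d\n",
	       err, watchdog_user_enabled_demo, watchdog_thresh_demo);
	return 0;
}

The same shape appears in the patch itself: old_thresh and old_enabled are captured with ACCESS_ONCE() before proc_dointvec_minmax() runs, and both are written back when watchdog_enable_all_cpus() returns an error.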

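Also for illustration only: the user_namespace hunk above bounds namespace nesting by recording a level of parent->level + 1 in each child and refusing creation with -EUSERS once the parent's level exceeds 32. The standalone C sketch below mirrors just that depth accounting; struct demo_ns and create_demo_ns() are hypothetical and merely stand in for the kernel's struct user_namespace and create_user_ns().

/*
 * Standalone userspace sketch (not kernel code) of the nesting-depth cap
 * added to create_user_ns().  The struct and function names are invented
 * for the demo; allocations are intentionally not freed before exit.
 */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

struct demo_ns {
	struct demo_ns *parent;
	int level;
};

static int create_demo_ns(struct demo_ns *parent, struct demo_ns **out)
{
	struct demo_ns *ns;

	/* mirrors: if (parent_ns->level > 32) return -EUSERS; */
	if (parent->level > 32)
		return -EUSERS;

	ns = malloc(sizeof(*ns));
	if (!ns)
		return -ENOMEM;

	ns->parent = parent;
	ns->level = parent->level + 1;	/* mirrors ns->level = parent_ns->level + 1 */
	*out = ns;
	return 0;
}

int main(void)
{
	struct demo_ns root = { .parent = NULL, .level = 0 };
	struct demo_ns *ns = &root;
	int depth = 0, err;

	/* keep nesting until the depth cap is hit */
	while (!(err = create_demo_ns(ns, &ns)))
		depth++;

	printf("nesting stopped at depth %d with error %d\n", depth, err);
	return 0;
}

In the kernel patch only the level check and the ns->level assignment are the new lines in create_user_ns(); everything else in this sketch exists just to exercise them.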