From 72d0d248ca8232dbd30d35b42d0d86e39b3e322b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 16 Jul 2012 15:23:48 -0400 Subject: fuse: add FUSE_AUTO_INVAL_DATA init flag FUSE_AUTO_INVAL_DATA is provided to enable updated/auto cache invalidation logic. Signed-off-by: Brian Foster Signed-off-by: Miklos Szeredi --- include/linux/fuse.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 9303348965fb..e4a9d2af9aaa 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -57,6 +57,9 @@ * * 7.19 * - add FUSE_FALLOCATE + * + * 7.20 + * - add FUSE_AUTO_INVAL_DATA */ #ifndef _LINUX_FUSE_H @@ -88,7 +91,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 19 +#define FUSE_KERNEL_MINOR_VERSION 20 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -167,6 +170,7 @@ struct fuse_file_lock { * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." * FUSE_DONT_MASK: don't apply umask to file mode on create operations * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks + * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -176,6 +180,7 @@ struct fuse_file_lock { #define FUSE_BIG_WRITES (1 << 5) #define FUSE_DONT_MASK (1 << 6) #define FUSE_FLOCK_LOCKS (1 << 10) +#define FUSE_AUTO_INVAL_DATA (1 << 12) /** * CUSE INIT request/reply flags -- cgit v1.3 From 69fe05c90ed58aac956dccb9e6d3a325fb3b8767 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2012 16:09:40 +0200 Subject: fuse: add missing INIT flags Add missing flags that userspace derived from the protocol version number. This makes the protocol more flexible. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 3 ++- include/linux/fuse.h | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index dd4401650b47..ce0a2838ccd0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -882,7 +882,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | - FUSE_FLOCK_LOCKS | FUSE_AUTO_INVAL_DATA; + FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | + FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/include/linux/fuse.h b/include/linux/fuse.h index e4a9d2af9aaa..6455c5b64c2e 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -169,7 +169,11 @@ struct fuse_file_lock { * FUSE_POSIX_LOCKS: remote locking for POSIX file locks * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." * FUSE_DONT_MASK: don't apply umask to file mode on create operations + * FUSE_SPLICE_WRITE: kernel supports splice write on the device + * FUSE_SPLICE_MOVE: kernel supports splice move on the device + * FUSE_SPLICE_READ: kernel supports splice read on the device * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks + * FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories * FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages */ #define FUSE_ASYNC_READ (1 << 0) @@ -179,7 +183,11 @@ struct fuse_file_lock { #define FUSE_EXPORT_SUPPORT (1 << 4) #define FUSE_BIG_WRITES (1 << 5) #define FUSE_DONT_MASK (1 << 6) +#define FUSE_SPLICE_WRITE (1 << 7) +#define FUSE_SPLICE_MOVE (1 << 8) +#define FUSE_SPLICE_READ (1 << 9) #define FUSE_FLOCK_LOCKS (1 << 10) +#define FUSE_HAS_IOCTL_DIR (1 << 11) #define FUSE_AUTO_INVAL_DATA (1 << 12) /** -- cgit v1.3 From f3840dc0fb57aef120c5ee8241cdc9aaf3cec8d4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2012 16:09:40 +0200 Subject: fuse: add missing INIT flag descriptions Signed-off-by: Miklos Szeredi --- include/linux/fuse.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 6455c5b64c2e..d8c713e148e3 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -166,8 +166,12 @@ struct fuse_file_lock { /** * INIT request/reply flags * + * FUSE_ASYNC_READ: asynchronous read requests * FUSE_POSIX_LOCKS: remote locking for POSIX file locks + * FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported) + * FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".." + * FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB * FUSE_DONT_MASK: don't apply umask to file mode on create operations * FUSE_SPLICE_WRITE: kernel supports splice write on the device * FUSE_SPLICE_MOVE: kernel supports splice move on the device -- cgit v1.3 From dc9b229a58dc0dfed34272ff26c6d5fd17c674e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Jul 2012 19:29:45 +0200 Subject: genirq: Allow irq chips to mark themself oneshot safe Some interrupt chips like MSI are oneshot safe by implementation. For those interrupts we can avoid the mask/unmask sequence for threaded interrupt handlers. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1207132056540.32033@ionos Cc: Linus Torvalds Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Jan Kiszka --- include/linux/irq.h | 1 + kernel/irq/manage.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 553fb66da130..216b0ba109d7 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -349,6 +349,7 @@ enum { IRQCHIP_MASK_ON_SUSPEND = (1 << 2), IRQCHIP_ONOFFLINE_ENABLED = (1 << 3), IRQCHIP_SKIP_SET_WAKE = (1 << 4), + IRQCHIP_ONESHOT_SAFE = (1 << 5), }; /* This include will go away once we isolated irq_desc usage to core code */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8c548232ba39..2e326d1ebec1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -959,6 +959,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_thread; } + /* + * Drivers are often written to work w/o knowledge about the + * underlying irq chip implementation, so a request for a + * threaded irq without a primary hard irq context handler + * requires the ONESHOT flag to be set. Some irq chips like + * MSI based interrupts are per se one shot safe. Check the + * chip flags, so we can avoid the unmask dance at the end of + * the threaded handler for those. + */ + if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE) + new->flags &= ~IRQF_ONESHOT; + /* * The following block of code has to be executed atomically */ @@ -1033,7 +1045,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ new->thread_mask = 1 << ffz(thread_mask); - } else if (new->handler == irq_default_primary_handler) { + } else if (new->handler == irq_default_primary_handler && + !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) { /* * The interrupt was requested with handler = NULL, so * we use the default primary handler for it. But it -- cgit v1.3 From 6956dc568f34107f1d02b24f87efe7250803fc87 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 20 Jul 2012 14:19:50 +0800 Subject: sched/numa: Add SD_PERFER_SIBLING to CPU domain Commit 8e7fbcbc22c ("sched: Remove stale power aware scheduling remnants and dysfunctional knobs") removed SD_PERFER_SIBLING from the CPU domain. On NUMA machines this causes that load_balance() doesn't perfer LCPU in same physical CPU package. It causes some actual performance regressions on our NUMA machines from Core2 to NHM and SNB. Adding this domain flag again recovers the performance drop. This change doesn't have any bad impact on any of my benchmarks: specjbb, kbuild, fio, hackbench .. etc, on all my machines. Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1342765190-21540-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar --- include/linux/topology.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index e91cd43394df..fec12d667211 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -164,6 +164,7 @@ int arch_update_cpu_topology(void); | 0*SD_SHARE_CPUPOWER \ | 0*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ + | 1*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ .balance_interval = 1, \ -- cgit v1.3 From a7e4786b937a3ae918a7520cfdba557a80915fa7 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 21 Jul 2012 00:54:59 +0530 Subject: sched: Fix comment about PREEMPT_ACTIVE bit location PREEMPT_ACTIVE flag is bit 27, not 28. Fix the comment. Signed-off-by: Srivatsa S. Bhat Cc: paulmck@linux.vnet.ibm.com Cc: josh@joshtriplett.org Cc: rostedt@goodmis.org Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120720192459.6149.14821.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index bb7f30971858..305f23cd7cff 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -22,7 +22,7 @@ * * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024) * - bit 26 is the NMI_MASK - * - bit 28 is the PREEMPT_ACTIVE flag + * - bit 27 is the PREEMPT_ACTIVE flag * * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 -- cgit v1.3 From 0f26d0e0a715556270d85b7946b99546a2f92888 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 30 Jul 2012 22:44:41 -0500 Subject: kdb: Remove unused KDB_FLAG_ONLY_DO_DUMP This code cleanup was missed in the original kdb merge, and this code is simply not used at all. The code that was previously used to set the KDB_FLAG_ONLY_DO_DUMP was removed prior to the initial kdb merge. Signed-off-by: Jason Wessel --- include/linux/kdb.h | 2 -- kernel/debug/kdb/kdb_main.c | 15 +-------------- 2 files changed, 1 insertion(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 064725854db8..42d9e863a313 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -75,8 +75,6 @@ extern const char *kdb_diemsg; #define KDB_FLAG_CATASTROPHIC (1 << 1) /* A catastrophic event has occurred */ #define KDB_FLAG_CMD_INTERRUPT (1 << 2) /* Previous command was interrupted */ #define KDB_FLAG_NOIPI (1 << 3) /* Do not send IPIs */ -#define KDB_FLAG_ONLY_DO_DUMP (1 << 4) /* Only do a dump, used when - * kdb is off */ #define KDB_FLAG_NO_CONSOLE (1 << 5) /* No console is available, * kdb is disabled */ #define KDB_FLAG_NO_VT_CONSOLE (1 << 6) /* No VT console is available, do diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 1f91413edb87..31df1706b9a9 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -139,11 +139,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); static char *__env[] = { #if defined(CONFIG_SMP) "PROMPT=[%d]kdb> ", - "MOREPROMPT=[%d]more> ", #else "PROMPT=kdb> ", - "MOREPROMPT=more> ", #endif + "MOREPROMPT=more> ", "RADIX=16", "MDCOUNT=8", /* lines of md output */ KDB_PLATFORM_ENV, @@ -1236,18 +1235,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, *cmdbuf = '\0'; *(cmd_hist[cmd_head]) = '\0'; - if (KDB_FLAG(ONLY_DO_DUMP)) { - /* kdb is off but a catastrophic error requires a dump. - * Take the dump and reboot. - * Turn on logging so the kdb output appears in the log - * buffer in the dump. - */ - const char *setargs[] = { "set", "LOGGING", "1" }; - kdb_set(2, setargs); - kdb_reboot(0, NULL); - /*NOTREACHED*/ - } - do_full_getstr: #if defined(CONFIG_SMP) snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), -- cgit v1.3 From e6dab5ffab59e910ec0e3355f4a6f29f7a7be474 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Wed, 11 Jul 2012 18:14:58 +0400 Subject: perf/trace: Add ability to set a target task for events A few events are interesting not only for a current task. For example, sched_stat_* events are interesting for a task which wakes up. For this reason, it will be good if such events will be delivered to a target task too. Now a target task can be set by using __perf_task(). The original idea and a draft patch belongs to Peter Zijlstra. I need these events for profiling sleep times. sched_switch is used for getting callchains and sched_stat_* is used for getting time periods. These events are combined in user space, then it can be analyzed by perf tools. Inspired-by: Peter Zijlstra Cc: Steven Rostedt Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Arun Sharma Signed-off-by: Andrew Vagin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1342016098-213063-1-git-send-email-avagin@openvz.org Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 5 +++-- include/linux/perf_event.h | 3 ++- include/trace/events/sched.h | 4 ++++ include/trace/ftrace.h | 6 +++++- kernel/events/callchain.c | 9 ++++++++- kernel/events/core.c | 30 ++++++++++++++++++++++++++++-- kernel/events/internal.h | 3 ++- kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_kprobe.c | 6 ++++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 11 files changed, 60 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index af961d6f7ab1..642928cf57b4 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -306,9 +306,10 @@ extern void *perf_trace_buf_prepare(int size, unsigned short type, static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, - u64 count, struct pt_regs *regs, void *head) + u64 count, struct pt_regs *regs, void *head, + struct task_struct *task) { - perf_tp_event(addr, count, raw_data, size, regs, head, rctx); + perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task); } #endif diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 76c5c8b724a7..7602ccb3f40e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1272,7 +1272,8 @@ static inline bool perf_paranoid_kernel(void) extern void perf_event_init(void); extern void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, struct pt_regs *regs, - struct hlist_head *head, int rctx); + struct hlist_head *head, int rctx, + struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ea7a2035456d..5a8671e8a67f 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -73,6 +73,9 @@ DECLARE_EVENT_CLASS(sched_wakeup_template, __entry->prio = p->prio; __entry->success = success; __entry->target_cpu = task_cpu(p); + ) + TP_perf_assign( + __perf_task(p); ), TP_printk("comm=%s pid=%d prio=%d success=%d target_cpu=%03d", @@ -325,6 +328,7 @@ DECLARE_EVENT_CLASS(sched_stat_template, ) TP_perf_assign( __perf_count(delay); + __perf_task(tsk); ), TP_printk("comm=%s pid=%d delay=%Lu [ns]", diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index c6bc2faaf261..a763888a36f9 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -712,6 +712,9 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call #undef __perf_count #define __perf_count(c) __count = (c) +#undef __perf_task +#define __perf_task(t) __task = (t) + #undef TP_perf_assign #define TP_perf_assign(args...) args @@ -725,6 +728,7 @@ perf_trace_##call(void *__data, proto) \ struct ftrace_raw_##call *entry; \ struct pt_regs __regs; \ u64 __addr = 0, __count = 1; \ + struct task_struct *__task = NULL; \ struct hlist_head *head; \ int __entry_size; \ int __data_size; \ @@ -752,7 +756,7 @@ perf_trace_##call(void *__data, proto) \ \ head = this_cpu_ptr(event_call->perf_events); \ perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ - __count, &__regs, head); \ + __count, &__regs, head, __task); \ } /* diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 6581a040f399..98d4597f43d6 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -153,7 +153,8 @@ put_callchain_entry(int rctx) put_recursion_context(__get_cpu_var(callchain_recursion), rctx); } -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs) { int rctx; struct perf_callchain_entry *entry; @@ -178,6 +179,12 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) } if (regs) { + /* + * Disallow cross-task user callchains. + */ + if (event->ctx->task && event->ctx->task != current) + goto exit_put; + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); } diff --git a/kernel/events/core.c b/kernel/events/core.c index f1cf0edeb39a..b7935fcec7d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4039,7 +4039,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - data->callchain = perf_callchain(regs); + data->callchain = perf_callchain(event, regs); if (data->callchain) size += data->callchain->nr; @@ -5209,7 +5209,8 @@ static int perf_tp_event_match(struct perf_event *event, } void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head, int rctx) + struct pt_regs *regs, struct hlist_head *head, int rctx, + struct task_struct *task) { struct perf_sample_data data; struct perf_event *event; @@ -5228,6 +5229,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_swevent_event(event, count, &data, regs); } + /* + * If we got specified a target task, also iterate its context and + * deliver this event there too. + */ + if (task && task != current) { + struct perf_event_context *ctx; + struct trace_entry *entry = record; + + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + if (!ctx) + goto unlock; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (event->attr.type != PERF_TYPE_TRACEPOINT) + continue; + if (event->attr.config != entry->type) + continue; + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } +unlock: + rcu_read_unlock(); + } + perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index b0b107f90afc..a096c19f2c2a 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -101,7 +101,8 @@ __output_copy(struct perf_output_handle *handle, } /* Callchain handling */ -extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs); extern int get_callchain_buffers(void); extern void put_callchain_buffers(void); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fee3752ae8f6..8a6d2ee2086c 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -281,7 +281,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) head = this_cpu_ptr(event_function.perf_events); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, - 1, ®s, head); + 1, ®s, head, NULL); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b31d3d5699fe..1a2117043bb1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ip, 1, regs, head, NULL); } /* Kretprobe profile handler */ @@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, + entry->ret_ip, 1, regs, head, NULL); } #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 96fc73369099..60e4d7875672 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -532,7 +532,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) (unsigned long *)&rec->args); head = this_cpu_ptr(sys_data->enter_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysenter_enable(struct ftrace_event_call *call) @@ -608,7 +608,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->ret = syscall_get_return_value(current, regs); head = this_cpu_ptr(sys_data->exit_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); } int perf_sysexit_enable(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2b36ac68549e..03003cd7dd96 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); out: preempt_enable(); -- cgit v1.3 From a7ea3bbf5d58f4df2265d312f91d5769eabc8144 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 27 Jul 2012 14:48:09 -0400 Subject: time/jiffies: Allow CLOCK_TICK_RATE to be undefined CLOCK_TICK_RATE is a legacy constant that defines the timer device's granularity. On hardware with particularly coarse granularity, this constant is used to reduce accumulated time error when using jiffies as a clocksource, by calculating the hardware's actual tick length rather then just assuming it is 1sec/HZ. However, for the most part this is unnecessary, as most modern systems don't use jiffies for their clocksource, and their tick device is sufficiently fine grained to avoid major error. Thus, this patch allows an architecture to not define CLOCK_TICK_RATE, in which case ACTHZ defaults to (HZ << 8). Signed-off-by: Catalin Marinas Acked-by: Arnd Bergmann Cc: Richard Cochran Cc: Prarit Bhargava Cc: Andrew Morton [ Commit log & intention tweaks ] Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1343414893-45779-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/jiffies.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 265e2c3cbd1c..7d24466fb80b 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -39,9 +39,6 @@ # error Invalid value of HZ. #endif -/* LATCH is used in the interval timer and ftape setup. */ -#define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ - /* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can * improve accuracy by shifting LSH bits, hence calculating: * (NOM << LSH) / DEN @@ -54,8 +51,15 @@ #define SH_DIV(NOM,DEN,LSH) ( (((NOM) / (DEN)) << (LSH)) \ + ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN)) +#ifdef CLOCK_TICK_RATE +/* LATCH is used in the interval timer and ftape setup. */ +# define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ + /* HZ is the requested value. ACTHZ is actual HZ ("<< 8" is for accuracy) */ -#define ACTHZ (SH_DIV (CLOCK_TICK_RATE, LATCH, 8)) +# define ACTHZ (SH_DIV(CLOCK_TICK_RATE, LATCH, 8)) +#else +# define ACTHZ (HZ << 8) +#endif /* TICK_NSEC is the time between ticks in nsec assuming real ACTHZ */ #define TICK_NSEC (SH_DIV (1000000UL * 1000, ACTHZ, 8)) -- cgit v1.3 From 02ab20ae38337b99b5c29c81090f594b8fd61283 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 27 Jul 2012 14:48:10 -0400 Subject: time/jiffies: Rename ACTHZ to SHIFTED_HZ Ingo noted that ACTHZ is a confusing name, and requested it be renamed, so this patch renames ACTHZ to SHIFTED_HZ to better describe it. Signed-off-by: John Stultz Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1343414893-45779-3-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/jiffies.h | 21 +++++++++++++-------- include/linux/timex.h | 2 +- kernel/time/jiffies.c | 2 +- kernel/time/ntp.c | 2 +- 4 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 7d24466fb80b..82680541576d 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -55,21 +55,26 @@ /* LATCH is used in the interval timer and ftape setup. */ # define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ -/* HZ is the requested value. ACTHZ is actual HZ ("<< 8" is for accuracy) */ -# define ACTHZ (SH_DIV(CLOCK_TICK_RATE, LATCH, 8)) +/* + * HZ is the requested value. However the CLOCK_TICK_RATE may not allow + * for exactly HZ. So SHIFTED_HZ is high res HZ ("<< 8" is for accuracy) + */ +# define SHIFTED_HZ (SH_DIV(CLOCK_TICK_RATE, LATCH, 8)) #else -# define ACTHZ (HZ << 8) +# define SHIFTED_HZ (HZ << 8) #endif -/* TICK_NSEC is the time between ticks in nsec assuming real ACTHZ */ -#define TICK_NSEC (SH_DIV (1000000UL * 1000, ACTHZ, 8)) +/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */ +#define TICK_NSEC (SH_DIV(1000000UL * 1000, SHIFTED_HZ, 8)) /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -/* TICK_USEC_TO_NSEC is the time between ticks in nsec assuming real ACTHZ and */ -/* a value TUSEC for TICK_USEC (can be set bij adjtimex) */ -#define TICK_USEC_TO_NSEC(TUSEC) (SH_DIV (TUSEC * USER_HZ * 1000, ACTHZ, 8)) +/* + * TICK_USEC_TO_NSEC is the time between ticks in nsec assuming SHIFTED_HZ and + * a value TUSEC for TICK_USEC (can be set bij adjtimex) + */ +#define TICK_USEC_TO_NSEC(TUSEC) (SH_DIV(TUSEC * USER_HZ * 1000, SHIFTED_HZ, 8)) /* some arch's have a small-data section that can be accessed register-relative * but that can only take up to, say, 4-byte variables. jiffies being part of diff --git a/include/linux/timex.h b/include/linux/timex.h index 99bc88b1fc02..7c5ceb20e03a 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -232,7 +232,7 @@ struct timex { * estimated error = NTP dispersion. */ extern unsigned long tick_usec; /* USER_HZ period (usec) */ -extern unsigned long tick_nsec; /* ACTHZ period (nsec) */ +extern unsigned long tick_nsec; /* SHIFTED_HZ period (nsec) */ extern void ntp_init(void); extern void ntp_clear(void); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a470154e0408..46da0537c10b 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -37,7 +37,7 @@ * requested HZ value. It is also not recommended * for "tick-less" systems. */ -#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ)) /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier * conversion, the .shift value could be zero. However diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b7fbadc5c973..24174b4d669b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock); /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; -/* ACTHZ period (nsecs): */ +/* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; static u64 tick_length; -- cgit v1.3 From 30b678d844af3305cda5953467005cebb5d7b687 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 30 Jul 2012 15:57:00 +0000 Subject: net: Allow driver to limit number of GSO segments per skb A peer (or local user) may cause TCP to use a nominal MSS of as little as 88 (actual MSS of 76 with timestamps). Given that we have a sufficiently prodigious local sender and the peer ACKs quickly enough, it is nevertheless possible to grow the window for such a connection to the point that we will try to send just under 64K at once. This results in a single skb that expands to 861 segments. In some drivers with TSO support, such an skb will require hundreds of DMA descriptors; a substantial fraction of a TX ring or even more than a full ring. The TX queue selected for the skb may stall and trigger the TX watchdog repeatedly (since the problem skb will be retried after the TX reset). This particularly affects sfc, for which the issue is designated as CVE-2012-3412. Therefore: 1. Add the field net_device::gso_max_segs holding the device-specific limit. 2. In netif_skb_features(), if the number of segments is too high then mask out GSO features to force fall back to software GSO. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eb06e58bed0b..a9db4f33407f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1300,6 +1300,8 @@ struct net_device { /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; +#define GSO_MAX_SEGS 65535 + u16 gso_max_segs; #ifdef CONFIG_DCB /* Data Center Bridging netlink ops */ diff --git a/net/core/dev.c b/net/core/dev.c index 0cb3fe8d8e72..f91abf800161 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2134,6 +2134,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) __be16 protocol = skb->protocol; netdev_features_t features = skb->dev->features; + if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs) + features &= ~NETIF_F_GSO_MASK; + if (protocol == htons(ETH_P_8021Q)) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; @@ -5986,6 +5989,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_MAX_SIZE; + dev->gso_max_segs = GSO_MAX_SEGS; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); -- cgit v1.3 From c263c2c1ad615e935d563cd7be11d417f94895d9 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Mon, 23 Jul 2012 18:20:12 +0200 Subject: bcma: BCM43228 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rafał Miłecki Signed-off-by: John W. Linville --- drivers/bcma/host_pci.c | 1 + drivers/bcma/sprom.c | 4 +++- include/linux/bcma/bcma_driver_chipcommon.h | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c index 11b32d2642df..a6e5672c67e7 100644 --- a/drivers/bcma/host_pci.c +++ b/drivers/bcma/host_pci.c @@ -272,6 +272,7 @@ static DEFINE_PCI_DEVICE_TABLE(bcma_pci_bridge_tbl) = { { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4331) }, { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4353) }, { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4357) }, + { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4359) }, { PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x4727) }, { 0, }, }; diff --git a/drivers/bcma/sprom.c b/drivers/bcma/sprom.c index 26823d97fd9f..9ea4627dc0c2 100644 --- a/drivers/bcma/sprom.c +++ b/drivers/bcma/sprom.c @@ -507,7 +507,9 @@ static bool bcma_sprom_onchip_available(struct bcma_bus *bus) /* for these chips OTP is always available */ present = true; break; - + case BCMA_CHIP_ID_BCM43228: + present = chip_status & BCMA_CC_CHIPST_43228_OTP_PRESENT; + break; default: present = false; break; diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 3c80885fa829..d323a4b4143c 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -89,6 +89,12 @@ #define BCMA_CC_CHIPST_4313_OTP_PRESENT 2 #define BCMA_CC_CHIPST_4331_SPROM_PRESENT 2 #define BCMA_CC_CHIPST_4331_OTP_PRESENT 4 +#define BCMA_CC_CHIPST_43228_ILP_DIV_EN 0x00000001 +#define BCMA_CC_CHIPST_43228_OTP_PRESENT 0x00000002 +#define BCMA_CC_CHIPST_43228_SERDES_REFCLK_PADSEL 0x00000004 +#define BCMA_CC_CHIPST_43228_SDIO_MODE 0x00000008 +#define BCMA_CC_CHIPST_43228_SDIO_OTP_PRESENT 0x00000010 +#define BCMA_CC_CHIPST_43228_SDIO_RESET 0x00000020 #define BCMA_CC_CHIPST_4706_PKG_OPTION BIT(0) /* 0: full-featured package 1: low-cost package */ #define BCMA_CC_CHIPST_4706_SFLASH_PRESENT BIT(1) /* 0: parallel, 1: serial flash is present */ #define BCMA_CC_CHIPST_4706_SFLASH_TYPE BIT(2) /* 0: 8b-p/ST-s flash, 1: 16b-p/Atmal-s flash */ -- cgit v1.3 From 095adbb6441172985f5ddc3b9e88cb3191bdeac4 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Tue, 31 Jul 2012 17:41:09 +0200 Subject: ACPI: Only count valid srat memory structures Otherwise you could run into: WARN_ON in numa_register_memblks(), because node_possible_map is zero References: https://bugzilla.novell.com/show_bug.cgi?id=757888 On this machine (ProLiant ML570 G3) the SRAT table contains: - No processor affinities - One memory affinity structure (which is set disabled) CC: Per Jessen CC: Andi Kleen Signed-off-by: Thomas Renninger Signed-off-by: Len Brown --- arch/ia64/kernel/acpi.c | 5 +++-- arch/x86/mm/srat.c | 15 ++++++++------- drivers/acpi/numa.c | 8 +++++--- include/linux/acpi.h | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 6f38b6120d96..440578850ae5 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -497,7 +497,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) srat_num_cpus++; } -void __init +int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { unsigned long paddr, size; @@ -512,7 +512,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) /* Ignore disabled entries */ if (!(ma->flags & ACPI_SRAT_MEM_ENABLED)) - return; + return -1; /* record this node in proximity bitmap */ pxm_bit_set(pxm); @@ -531,6 +531,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) p->size = size; p->nid = pxm; num_node_memblks++; + return 0; } void __init acpi_numa_arch_fixup(void) diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 4599c3e8bcb6..4ddf497ca65b 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -142,23 +142,23 @@ static inline int save_add_info(void) {return 0;} #endif /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -void __init +int __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { u64 start, end; int node, pxm; if (srat_disabled()) - return; + return -1; if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { bad_srat(); - return; + return -1; } if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; + return -1; if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) - return; + return -1; start = ma->base_address; end = start + ma->length; pxm = ma->proximity_domain; @@ -168,12 +168,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains.\n"); bad_srat(); - return; + return -1; } if (numa_add_memblk(node, start, end) < 0) { bad_srat(); - return; + return -1; } node_set(node, numa_nodes_parsed); @@ -181,6 +181,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1); + return 0; } void __init acpi_numa_arch_fixup(void) {} diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 2a6399345c85..cb31298ca684 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -237,6 +237,8 @@ acpi_parse_processor_affinity(struct acpi_subtable_header *header, return 0; } +static int __initdata parsed_numa_memblks; + static int __init acpi_parse_memory_affinity(struct acpi_subtable_header * header, const unsigned long end) @@ -250,8 +252,8 @@ acpi_parse_memory_affinity(struct acpi_subtable_header * header, acpi_table_print_srat_entry(header); /* let architecture-dependent part to do it */ - acpi_numa_memory_affinity_init(memory_affinity); - + if (!acpi_numa_memory_affinity_init(memory_affinity)) + parsed_numa_memblks++; return 0; } @@ -306,7 +308,7 @@ int __init acpi_numa_init(void) if (cnt < 0) return cnt; - else if (cnt == 0) + else if (!parsed_numa_memblks) return -ENOENT; return 0; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3ad510b25283..4f2a76224509 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -96,7 +96,7 @@ void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); void acpi_numa_slit_init (struct acpi_table_slit *slit); void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); -void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); +int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); void acpi_numa_arch_fixup(void); #ifdef CONFIG_ACPI_HOTPLUG_CPU -- cgit v1.3 From 76582d0a68ab6c46987dd678165c26ad81c44cdb Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 25 Jul 2012 16:24:49 +0200 Subject: iommu: Include linux/types.h The linux/iommu.h header uses types defined in linux/types.h but doesn't include it. Signed-off-by: Thierry Reding Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54d6d690073c..0a85199191b9 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -20,6 +20,7 @@ #define __LINUX_IOMMU_H #include +#include #define IOMMU_READ (1) #define IOMMU_WRITE (2) -- cgit v1.3 From ba1eabfade3272596000bf3c62b68ca375e48b08 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 3 Aug 2012 15:55:41 +0200 Subject: iommu: Add missing forward declaration in include file The 'struct notifier_block' is not used in linux/iommu.h but not declared anywhere. Add a forward declaration for it. Reported-by: Thierry Reding Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0a85199191b9..7e83370e6fd2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -31,6 +31,7 @@ struct iommu_group; struct bus_type; struct device; struct iommu_domain; +struct notifier_block; /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 -- cgit v1.3 From f0cd2dbb6cf387c11f87265462e370bb5469299e Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 25 Jul 2012 18:11:59 +0300 Subject: vfs: kill write_super and sync_supers Finally we can kill the 'sync_supers' kernel thread along with the '->write_super()' superblock operation because all the users are gone. Now every file-system is supposed to self-manage own superblock and its dirty state. The nice thing about killing this thread is that it improves power management. Indeed, 'sync_supers' is a source of monotonic system wake-ups - it woke up every 5 seconds no matter what - even if there were no dirty superblocks and even if there were no file-systems using this service (e.g., btrfs and journalled ext4 do not need it). So it was wasting power most of the time. And because the thread was in the core of the kernel, all systems had to have it. So I am quite happy to make it go away. Interestingly, this thread is a left-over from the pdflush kernel thread which was a self-forking kernel thread responsible for all the write-back in old Linux kernels. It was turned into per-block device BDI threads, and 'sync_supers' was a left-over. Thus, R.I.P, pdflush as well. Signed-off-by: Artem Bityutskiy Signed-off-by: Al Viro --- fs/super.c | 40 ---------------------------------- include/linux/backing-dev.h | 1 - include/linux/fs.h | 3 --- mm/backing-dev.c | 52 --------------------------------------------- mm/page-writeback.c | 1 - 5 files changed, 97 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index b05cf47463d0..0902cfa6a12e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -536,46 +536,6 @@ void drop_super(struct super_block *sb) EXPORT_SYMBOL(drop_super); -/** - * sync_supers - helper for periodic superblock writeback - * - * Call the write_super method if present on all dirty superblocks in - * the system. This is for the periodic writeback used by most older - * filesystems. For data integrity superblock writeback use - * sync_filesystems() instead. - * - * Note: check the dirty flag before waiting, so we don't - * hold up the sync while mounting a device. (The newly - * mounted device won't need syncing.) - */ -void sync_supers(void) -{ - struct super_block *sb, *p = NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; - if (sb->s_op->write_super && sb->s_dirt) { - sb->s_count++; - spin_unlock(&sb_lock); - - down_read(&sb->s_umount); - if (sb->s_root && sb->s_dirt && (sb->s_flags & MS_BORN)) - sb->s_op->write_super(sb); - up_read(&sb->s_umount); - - spin_lock(&sb_lock); - if (p) - __put_super(p); - p = sb; - } - } - if (p) - __put_super(p); - spin_unlock(&sb_lock); -} - /** * iterate_supers - call function for all active superblocks * @f: function to call diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c97c6b9cd38e..2a9a9abc9126 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -124,7 +124,6 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, void bdi_start_background_writeback(struct backing_dev_info *bdi); int bdi_writeback_thread(void *data); int bdi_has_dirty_io(struct backing_dev_info *bdi); -void bdi_arm_supers_timer(void); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); diff --git a/include/linux/fs.h b/include/linux/fs.h index 38dba16c4176..aa110476a95b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1491,7 +1491,6 @@ struct sb_writers { struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ - unsigned char s_dirt; unsigned char s_blocksize_bits; unsigned long s_blocksize; loff_t s_maxbytes; /* Max file size */ @@ -1861,7 +1860,6 @@ struct super_operations { int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); - void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_fs) (struct super_block *); int (*unfreeze_fs) (struct super_block *); @@ -2397,7 +2395,6 @@ extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); extern int generic_write_sync(struct file *file, loff_t pos, loff_t count); -extern void sync_supers(void); extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6b4718e2ee34..b41823cc05e6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -39,12 +39,6 @@ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); LIST_HEAD(bdi_pending_list); -static struct task_struct *sync_supers_tsk; -static struct timer_list sync_supers_timer; - -static int bdi_sync_supers(void *); -static void sync_supers_timer_fn(unsigned long); - void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { if (wb1 < wb2) { @@ -250,12 +244,6 @@ static int __init default_bdi_init(void) { int err; - sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); - BUG_ON(IS_ERR(sync_supers_tsk)); - - setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); - bdi_arm_supers_timer(); - err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -270,46 +258,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -/* - * kupdated() used to do this. We cannot do it from the bdi_forker_thread() - * or we risk deadlocking on ->s_umount. The longer term solution would be - * to implement sync_supers_bdi() or similar and simply do it from the - * bdi writeback thread individually. - */ -static int bdi_sync_supers(void *unused) -{ - set_user_nice(current, 0); - - while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - - /* - * Do this periodically, like kupdated() did before. - */ - sync_supers(); - } - - return 0; -} - -void bdi_arm_supers_timer(void) -{ - unsigned long next; - - if (!dirty_writeback_interval) - return; - - next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; - mod_timer(&sync_supers_timer, round_jiffies_up(next)); -} - -static void sync_supers_timer_fn(unsigned long unused) -{ - wake_up_process(sync_supers_tsk); - bdi_arm_supers_timer(); -} - static void wakeup_timer_fn(unsigned long data) { struct backing_dev_info *bdi = (struct backing_dev_info *)data; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e5363f34e025..5ad5ce23c1e0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1532,7 +1532,6 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); - bdi_arm_supers_timer(); return 0; } -- cgit v1.3 From 0d5c3eba2e1e5aa74e097f49bc90b58f607e101c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 25 Jul 2012 18:12:08 +0300 Subject: vfs: nuke pdflush from comments The pdflush thread is long gone, so this patch removes references to pdflush from vfs comments. Signed-off-by: Artem Bityutskiy Signed-off-by: Al Viro --- fs/bio.c | 2 +- include/linux/writeback.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/bio.c b/fs/bio.c index 73922abba832..5eaa70c9d96e 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1312,7 +1312,7 @@ EXPORT_SYMBOL(bio_copy_kern); * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. - * But other code (eg, pdflush) could clean the pages if they are mapped + * But other code (eg, flusher threads) could clean the pages if they are mapped * pagecache. * * Simply disabling the call to bio_set_pages_dirty() is a good way to test the diff --git a/include/linux/writeback.h b/include/linux/writeback.h index c66fe3332d83..50c3e8fa06a8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -104,7 +104,6 @@ static inline void wait_on_inode(struct inode *inode) wait_on_bit(&inode->i_state, __I_NEW, inode_wait, TASK_UNINTERRUPTIBLE); } - /* * mm/page-writeback.c */ -- cgit v1.3 From 5d299f3d3c8a2fbc732b1bf03af36333ccec3130 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Aug 2012 05:09:33 +0000 Subject: net: ipv6: fix TCP early demux IPv6 needs a cookie in dst_check() call. We need to add rx_dst_cookie and provide a family independent sk_rx_dst_set(sk, skb) method to properly support IPv6 TCP early demux. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/net/inet_connection_sock.h | 1 + include/net/inet_sock.h | 9 --------- net/ipv4/tcp_input.c | 4 +++- net/ipv4/tcp_ipv4.c | 13 ++++++++++--- net/ipv4/tcp_minisocks.c | 2 +- net/ipv6/tcp_ipv6.c | 27 +++++++++++++++++++++++++-- 7 files changed, 41 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 379e433e15e0..879db26ec401 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -369,6 +369,7 @@ struct ipv6_pinfo { __u8 rcv_tclass; __u32 dst_cookie; + __u32 rx_dst_cookie; struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 5ee66f517b4f..ba1d3615acbb 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -39,6 +39,7 @@ struct inet_connection_sock_af_ops { int (*queue_xmit)(struct sk_buff *skb, struct flowi *fl); void (*send_check)(struct sock *sk, struct sk_buff *skb); int (*rebuild_header)(struct sock *sk); + void (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb); int (*conn_request)(struct sock *sk, struct sk_buff *skb); struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, struct request_sock *req, diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 83b567fe1941..613cfa401672 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -249,13 +249,4 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) return flags; } -static inline void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - - dst_hold(dst); - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; -} - #endif /* _INET_SOCK_H */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2fd2bc9e3c64..85308b90df80 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5392,6 +5392,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); + if (unlikely(sk->sk_rx_dst == NULL)) + inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. * The code loosely follows the one in the famous @@ -5605,7 +5607,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_ESTABLISHED); if (skb != NULL) { - inet_sk_rx_dst_set(sk, skb); + icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); security_inet_conn_established(sk, skb); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 42b2a6a73092..272241f16fcb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1627,9 +1627,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - if (unlikely(sk->sk_rx_dst == NULL)) - inet_sk_rx_dst_set(sk, skb); - if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; @@ -1872,10 +1869,20 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; +static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; +} + const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .net_header_len = sizeof(struct iphdr), diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 232a90c3ec86..d9c9dcef2de3 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -387,7 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - inet_sk_rx_dst_set(newsk, skb); + newicsk->icsk_af_ops->sk_rx_dst_set(newsk, skb); /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c66b90f71c9b..5a439e9a4c01 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1447,7 +1447,17 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC)); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + struct dst_entry *dst = sk->sk_rx_dst; + sock_rps_save_rxhash(sk, skb); + if (dst) { + if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || + dst->ops->check(dst, np->rx_dst_cookie) == NULL) { + dst_release(dst); + sk->sk_rx_dst = NULL; + } + } + if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) goto reset; if (opt_skb) @@ -1705,9 +1715,9 @@ static void tcp_v6_early_demux(struct sk_buff *skb) struct dst_entry *dst = sk->sk_rx_dst; struct inet_sock *icsk = inet_sk(sk); if (dst) - dst = dst_check(dst, 0); + dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); if (dst && - icsk->rx_dst_ifindex == inet6_iif(skb)) + icsk->rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } @@ -1719,10 +1729,23 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; +static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct rt6_info *rt = (const struct rt6_info *)dst; + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (rt->rt6i_node) + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; +} + static const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, + .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), -- cgit v1.3 From 035534ed3377d9def2c17717899fd64a111a785b Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 6 Aug 2012 18:02:29 +0200 Subject: canfd: remove redundant CAN FD flag The first idea of the CAN FD implementation started with a new struct canfd_frame to be used for both CAN FD frames and legacy CAN frames. The now mainlined implementation supports both CAN frame types simultaneously and distinguishes them only by their required sizes: CAN_MTU and CANFD_MTU. Only the struct canfd_frame contains a flags element which is needed for the additional CAN FD information. As CAN FD implicitly means that the 'Extened Data Length' mode is enabled the formerly defined CANFD_EDL bit became redundant and also confusing as an unset bit would be an error and would always need to be tested. This patch removes the obsolete CANFD_EDL bit and clarifies the documentation for the use of struct canfd_frame and the CAN FD relevant flags. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/linux/can.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can.h b/include/linux/can.h index 018055efc034..e52958d7c2d1 100644 --- a/include/linux/can.h +++ b/include/linux/can.h @@ -74,20 +74,21 @@ struct can_frame { /* * defined bits for canfd_frame.flags * - * As the default for CAN FD should be to support the high data rate in the - * payload section of the frame (HDR) and to support up to 64 byte in the - * data section (EDL) the bits are only set in the non-default case. - * Btw. as long as there's no real implementation for CAN FD network driver - * these bits are only preliminary. + * The use of struct canfd_frame implies the Extended Data Length (EDL) bit to + * be set in the CAN frame bitstream on the wire. The EDL bit switch turns + * the CAN controllers bitstream processor into the CAN FD mode which creates + * two new options within the CAN FD frame specification: * - * RX: NOHDR/NOEDL - info about received CAN FD frame - * ESI - bit from originating CAN controller - * TX: NOHDR/NOEDL - control per-frame settings if supported by CAN controller - * ESI - bit is set by local CAN controller + * Bit Rate Switch - to indicate a second bitrate is/was used for the payload + * Error State Indicator - represents the error state of the transmitting node + * + * As the CANFD_ESI bit is internally generated by the transmitting CAN + * controller only the CANFD_BRS bit is relevant for real CAN controllers when + * building a CAN FD frame for transmission. Setting the CANFD_ESI bit can make + * sense for virtual CAN interfaces to test applications with echoed frames. */ -#define CANFD_NOHDR 0x01 /* frame without high data rate */ -#define CANFD_NOEDL 0x02 /* frame without extended data length */ -#define CANFD_ESI 0x04 /* error state indicator */ +#define CANFD_BRS 0x01 /* bit rate switch (second bitrate for payload data) */ +#define CANFD_ESI 0x02 /* error state indicator of the transmitting node */ /** * struct canfd_frame - CAN flexible data rate frame structure -- cgit v1.3 From 817fea2df3c24b22f6123dc0106eb063b7132883 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 7 Aug 2012 11:48:33 -0600 Subject: vfio: Include vfio.h in installed headers Signed-off-by: Alex Williamson --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index d9a754474878..fa217607c582 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -391,6 +391,7 @@ header-y += v4l2-dv-timings.h header-y += v4l2-mediabus.h header-y += v4l2-subdev.h header-y += veth.h +header-y += vfio.h header-y += vhost.h header-y += videodev2.h header-y += virtio_9p.h -- cgit v1.3 From 300d3739e873d50d4c6e3656f89007a217fb1d29 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 7 Aug 2012 13:50:22 +0200 Subject: Revert "NMI watchdog: fix for lockup detector breakage on resume" Revert commit 45226e9 (NMI watchdog: fix for lockup detector breakage on resume) which breaks resume from system suspend on my SH7372 Mackerel board (by causing a NULL pointer dereference to happen) and is generally wrong, because it abuses the CPU hotplug functionality in a shamelessly blatant way. The original issue should be addressed through appropriate syscore resume callback instead. Signed-off-by: Rafael J. Wysocki --- include/linux/sched.h | 8 -------- kernel/power/suspend.c | 3 --- kernel/watchdog.c | 21 ++------------------- 3 files changed, 2 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c147e7024f11..b8c86648a2f9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -334,14 +334,6 @@ static inline void lockup_detector_init(void) } #endif -#if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND) -void lockup_detector_bootcpu_resume(void); -#else -static inline void lockup_detector_bootcpu_resume(void) -{ -} -#endif - #ifdef CONFIG_DETECT_HUNG_TASK extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_check_count; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1da39ea248fd..c8b7446b27df 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -178,9 +178,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); - /* Kick the lockup detector */ - lockup_detector_bootcpu_resume(); - Enable_cpus: enable_nonboot_cpus(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 69add8a9da68..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -575,7 +575,7 @@ out: /* * Create/destroy watchdog threads as CPUs come and go: */ -static int +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -610,27 +610,10 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -#ifdef CONFIG_SUSPEND -/* - * On exit from suspend we force an offline->online transition on the boot CPU - * so that the PMU state that was lost while in suspended state gets set up - * properly for the boot CPU. This information is required for restarting the - * NMI watchdog. - */ -void lockup_detector_bootcpu_resume(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - - cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu); - cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu); - cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu); -} -#endif - void __init lockup_detector_init(void) { void *cpu = (void *)(long)smp_processor_id(); -- cgit v1.3 From 9d8dad742ad1c74d7e7210ee05d0b44961d5ea16 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 9 Aug 2012 19:01:26 -0700 Subject: Yama: higher restrictions should block PTRACE_TRACEME The higher ptrace restriction levels should be blocking even PTRACE_TRACEME requests. The comments in the LSM documentation are misleading about when the checks happen (the parent does not go through security_ptrace_access_check() on a PTRACE_TRACEME call). Signed-off-by: Kees Cook Cc: stable@vger.kernel.org # 3.5.x and later Signed-off-by: James Morris --- Documentation/security/Yama.txt | 14 +++++++------- include/linux/security.h | 2 -- security/yama/yama_lsm.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/security/Yama.txt b/Documentation/security/Yama.txt index e369de2d48cd..dd908cf64ecf 100644 --- a/Documentation/security/Yama.txt +++ b/Documentation/security/Yama.txt @@ -46,14 +46,13 @@ restrictions, it can call prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, ...) so that any otherwise allowed process (even those in external pid namespaces) may attach. -These restrictions do not change how ptrace via PTRACE_TRACEME operates. - -The sysctl settings are: +The sysctl settings (writable only with CAP_SYS_PTRACE) are: 0 - classic ptrace permissions: a process can PTRACE_ATTACH to any other process running under the same uid, as long as it is dumpable (i.e. did not transition uids, start privileged, or have called - prctl(PR_SET_DUMPABLE...) already). + prctl(PR_SET_DUMPABLE...) already). Similarly, PTRACE_TRACEME is + unchanged. 1 - restricted ptrace: a process must have a predefined relationship with the inferior it wants to call PTRACE_ATTACH on. By default, @@ -61,12 +60,13 @@ The sysctl settings are: classic criteria is also met. To change the relationship, an inferior can call prctl(PR_SET_PTRACER, debugger, ...) to declare an allowed debugger PID to call PTRACE_ATTACH on the inferior. + Using PTRACE_TRACEME is unchanged. 2 - admin-only attach: only processes with CAP_SYS_PTRACE may use ptrace - with PTRACE_ATTACH. + with PTRACE_ATTACH, or through children calling PTRACE_TRACEME. -3 - no attach: no processes may use ptrace with PTRACE_ATTACH. Once set, - this sysctl cannot be changed to a lower value. +3 - no attach: no processes may use ptrace with PTRACE_ATTACH nor via + PTRACE_TRACEME. Once set, this sysctl value cannot be changed. The original children-only logic was based on the restrictions in grsecurity. diff --git a/include/linux/security.h b/include/linux/security.h index 4e5a73cdbbef..3dea6a9d568f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1242,8 +1242,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Check that the @parent process has sufficient permission to trace the * current process before allowing the current process to present itself * to the @parent process for tracing. - * The parent process will still have to undergo the ptrace_access_check - * checks before it is allowed to trace this one. * @parent contains the task_struct structure for debugger process. * Return 0 if permission is granted. * @capget: diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 83554ee8a587..d51b7c76c37d 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -290,10 +290,51 @@ static int yama_ptrace_access_check(struct task_struct *child, return rc; } +/** + * yama_ptrace_traceme - validate PTRACE_TRACEME calls + * @parent: task that will become the ptracer of the current task + * + * Returns 0 if following the ptrace is allowed, -ve on error. + */ +static int yama_ptrace_traceme(struct task_struct *parent) +{ + int rc; + + /* If standard caps disallows it, so does Yama. We should + * only tighten restrictions further. + */ + rc = cap_ptrace_traceme(parent); + if (rc) + return rc; + + /* Only disallow PTRACE_TRACEME on more aggressive settings. */ + switch (ptrace_scope) { + case YAMA_SCOPE_CAPABILITY: + if (!ns_capable(task_user_ns(parent), CAP_SYS_PTRACE)) + rc = -EPERM; + break; + case YAMA_SCOPE_NO_ATTACH: + rc = -EPERM; + break; + } + + if (rc) { + char name[sizeof(current->comm)]; + printk_ratelimited(KERN_NOTICE + "ptraceme of pid %d was attempted by: %s (pid %d)\n", + current->pid, + get_task_comm(name, parent), + parent->pid); + } + + return rc; +} + static struct security_operations yama_ops = { .name = "yama", .ptrace_access_check = yama_ptrace_access_check, + .ptrace_traceme = yama_ptrace_traceme, .task_prctl = yama_task_prctl, .task_free = yama_task_free, }; -- cgit v1.3