Diffstat (limited to 'kernel/trace')
-rw-r--r--  kernel/trace/Kconfig     |  12
-rw-r--r--  kernel/trace/blktrace.c  | 533
-rw-r--r--  kernel/trace/bpf_trace.c |  48
-rw-r--r--  kernel/trace/ftrace.c    |  17
4 files changed, 466 insertions(+), 144 deletions(-)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d2c79da81e4f..4661b9e606e0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -80,6 +80,12 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
If the architecture generates __patchable_function_entries sections
but does not want them included in the ftrace locations.
+config HAVE_DYNAMIC_FTRACE_WITH_JMP
+ bool
+ help
+ If the architecture supports replacing the __fentry__ call with a
+ "jmp" instruction.
+
config HAVE_SYSCALL_TRACEPOINTS
bool
help
@@ -330,6 +336,12 @@ config DYNAMIC_FTRACE_WITH_ARGS
depends on DYNAMIC_FTRACE
depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+config DYNAMIC_FTRACE_WITH_JMP
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ depends on HAVE_DYNAMIC_FTRACE_WITH_JMP
+
config FPROBE
bool "Kernel Function Probe (fprobe)"
depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 6941145b5058..af8cbc8e1a7c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -63,13 +63,116 @@ static int blk_probes_ref;
static void blk_register_tracepoints(void);
static void blk_unregister_tracepoints(void);
+static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu,
+ sector_t sector, int bytes, u64 what,
+ dev_t dev, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+{
+ /*
+ * These two are not needed in ftrace as they are in the
+ * generic trace_entry, filled by tracing_generic_entry_update,
+ * but for the trace_event->bin() synthesizer benefit we do it
+ * here too.
+ */
+ t->cpu = cpu;
+ t->pid = pid;
+
+ t->sector = sector;
+ t->bytes = bytes;
+ t->action = lower_32_bits(what);
+ t->device = dev;
+ t->error = error;
+ t->pdu_len = pdu_len + cgid_len;
+
+ if (cgid_len)
+ memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
+ if (pdu_len)
+ memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
+}
+
+static void record_blktrace_event2(struct blk_io_trace2 *t2, pid_t pid, int cpu,
+ sector_t sector, int bytes, u64 what,
+ dev_t dev, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data,
+ int pdu_len)
+{
+ t2->pid = pid;
+ t2->cpu = cpu;
+
+ t2->sector = sector;
+ t2->bytes = bytes;
+ t2->action = what;
+ t2->device = dev;
+ t2->error = error;
+ t2->pdu_len = pdu_len + cgid_len;
+
+ if (cgid_len)
+ memcpy((void *)t2 + sizeof(*t2), &cgid, cgid_len);
+ if (pdu_len)
+ memcpy((void *)t2 + sizeof(*t2) + cgid_len, pdu_data, pdu_len);
+}
+
+static void relay_blktrace_event1(struct blk_trace *bt, unsigned long sequence,
+ pid_t pid, int cpu, sector_t sector, int bytes,
+ u64 what, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+{
+ struct blk_io_trace *t;
+ size_t trace_len = sizeof(*t) + pdu_len + cgid_len;
+
+ t = relay_reserve(bt->rchan, trace_len);
+ if (!t)
+ return;
+
+ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+ t->sequence = sequence;
+ t->time = ktime_to_ns(ktime_get());
+
+ record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, error,
+ cgid, cgid_len, pdu_data, pdu_len);
+}
+
+static void relay_blktrace_event2(struct blk_trace *bt, unsigned long sequence,
+ pid_t pid, int cpu, sector_t sector,
+ int bytes, u64 what, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+{
+ struct blk_io_trace2 *t;
+ size_t trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len;
+
+ t = relay_reserve(bt->rchan, trace_len);
+ if (!t)
+ return;
+
+ t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE2_VERSION;
+ t->sequence = sequence;
+ t->time = ktime_to_ns(ktime_get());
+
+ record_blktrace_event2(t, pid, cpu, sector, bytes, what, bt->dev, error,
+ cgid, cgid_len, pdu_data, pdu_len);
+}
+
+static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence,
+ pid_t pid, int cpu, sector_t sector, int bytes,
+ u64 what, int error, u64 cgid,
+ ssize_t cgid_len, void *pdu_data, int pdu_len)
+{
+ if (bt->version == 2)
+ return relay_blktrace_event2(bt, sequence, pid, cpu, sector,
+ bytes, what, error, cgid, cgid_len,
+ pdu_data, pdu_len);
+ return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes,
+ what, error, cgid, cgid_len, pdu_data,
+ pdu_len);
+}
+
/*
* Send out a notify message.
*/
-static void trace_note(struct blk_trace *bt, pid_t pid, int action,
+static void trace_note(struct blk_trace *bt, pid_t pid, u64 action,
const void *data, size_t len, u64 cgid)
{
- struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
unsigned int trace_ctx = 0;
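The record_blktrace_event()/record_blktrace_event2() helpers added above define the on-record layout for both formats: the fixed header first, then the 8-byte cgroup id when present, then the driver PDU, with pdu_len covering both trailers. A minimal reader-side sketch of walking a v2 record follows; the struct here is a stand-in whose field order is inferred from this patch (and from the later comment placing a 32-bit pid at offset 28 and a 64-bit action at offset 32), not a copy of the real UAPI definition.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

/* Hypothetical mirror of struct blk_io_trace2; the layout is an assumption. */
struct blk_io_trace2_sketch {
	uint32_t magic;		/* BLK_IO_TRACE_MAGIC | format version */
	uint32_t sequence;
	uint64_t time;
	uint64_t sector;
	uint32_t bytes;
	uint32_t pid;		/* 32-bit pid, assumed at offset 28 */
	uint64_t action;	/* widened to 64 bits in v2, assumed at offset 32 */
	uint32_t device;
	uint32_t cpu;
	uint16_t error;
	uint16_t pdu_len;	/* cgid (if any) plus driver pdu bytes */
};

/* Walk one record: header, optional 8-byte cgid, then the pdu data. */
static void parse_v2_record(const void *rec, int has_cgid)
{
	const struct blk_io_trace2_sketch *t = rec;
	const char *payload = (const char *)rec + sizeof(*t);
	uint64_t cgid = 0;
	size_t cgid_len = has_cgid ? sizeof(cgid) : 0;

	if (cgid_len)
		memcpy(&cgid, payload, cgid_len);

	printf("pid=%u sector=%llu bytes=%u action=%#llx pdu=%zu\n",
	       t->pid, (unsigned long long)t->sector, t->bytes,
	       (unsigned long long)t->action,
	       (size_t)t->pdu_len - cgid_len);
}

Whether a record carries a cgid is signalled in the action word (__BLK_TA_CGROUP / __BLK_TN_CGROUP), as the kernel side in this hunk shows.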
@@ -77,38 +180,30 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
+ action = lower_32_bits(action | (cgid ? __BLK_TN_CGROUP : 0));
if (blk_tracer) {
+ struct blk_io_trace2 *t;
+ size_t trace_len = sizeof(*t) + cgid_len + len;
+
buffer = blk_tr->array_buffer.buffer;
trace_ctx = tracing_gen_ctx_flags(0);
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + len + cgid_len,
- trace_ctx);
+ trace_len, trace_ctx);
if (!event)
return;
t = ring_buffer_event_data(event);
- goto record_it;
+ record_blktrace_event2(t, pid, cpu, 0, 0,
+ action, bt->dev, 0, cgid, cgid_len,
+ (void *)data, len);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
+ return;
}
if (!bt->rchan)
return;
- t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
- if (t) {
- t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
- t->time = ktime_to_ns(ktime_get());
-record_it:
- t->device = bt->dev;
- t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
- t->pid = pid;
- t->cpu = cpu;
- t->pdu_len = len + cgid_len;
- if (cgid_len)
- memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
- memcpy((void *) t + sizeof(*t) + cgid_len, data, len);
-
- if (blk_tracer)
- trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
- }
+ relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid,
+ cgid_len, (void *)data, len);
}
/*
@@ -182,7 +277,7 @@ void __blk_trace_note_message(struct blk_trace *bt,
}
EXPORT_SYMBOL_GPL(__blk_trace_note_message);
-static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+static int act_log_check(struct blk_trace *bt, u64 what, sector_t sector,
pid_t pid)
{
if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
@@ -213,13 +308,12 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
* blk_io_trace structure and places it in a per-cpu subbuffer.
*/
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
- const blk_opf_t opf, u32 what, int error,
+ const blk_opf_t opf, u64 what, int error,
int pdu_len, void *pdu_data, u64 cgid)
{
struct task_struct *tsk = current;
struct ring_buffer_event *event = NULL;
struct trace_buffer *buffer = NULL;
- struct blk_io_trace *t;
unsigned long flags = 0;
unsigned long *sequence;
unsigned int trace_ctx = 0;
@@ -228,6 +322,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
bool blk_tracer = blk_tracer_enabled;
ssize_t cgid_len = cgid ? sizeof(cgid) : 0;
const enum req_op op = opf & REQ_OP_MASK;
+ size_t trace_len;
if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
return;
@@ -238,10 +333,47 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
what |= MASK_TC_BIT(opf, META);
what |= MASK_TC_BIT(opf, PREFLUSH);
what |= MASK_TC_BIT(opf, FUA);
- if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
+
+ switch (op) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
what |= BLK_TC_ACT(BLK_TC_DISCARD);
- if (op == REQ_OP_FLUSH)
+ break;
+ case REQ_OP_FLUSH:
what |= BLK_TC_ACT(BLK_TC_FLUSH);
+ break;
+ case REQ_OP_ZONE_APPEND:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_APPEND);
+ break;
+ case REQ_OP_ZONE_RESET:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_RESET);
+ break;
+ case REQ_OP_ZONE_RESET_ALL:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_RESET_ALL);
+ break;
+ case REQ_OP_ZONE_FINISH:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_FINISH);
+ break;
+ case REQ_OP_ZONE_OPEN:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_OPEN);
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE);
+ break;
+ case REQ_OP_WRITE_ZEROES:
+ what |= BLK_TC_ACT(BLK_TC_WRITE_ZEROES);
+ break;
+ default:
+ break;
+ }
+
+ /* Drop trace events for zone operations with blktrace v1 */
+ if (bt->version == 1 && (what >> BLK_TC_SHIFT) > BLK_TC_END_V1) {
+ pr_debug_ratelimited("blktrace v1 cannot trace zone operation 0x%llx\n",
+ (unsigned long long)what);
+ return;
+ }
+
if (cgid)
what |= __BLK_TA_CGROUP;
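The switch above maps each zoned request op onto its own BLK_TC_* class; those classes sit past BLK_TC_END_V1, which is why a v1 session drops such events (with a ratelimited debug message) rather than truncating the 64-bit action into the 32-bit v1 field. A filter in the same shape, shown only to make the check explicit:

#include <linux/types.h>

/* Illustration of the v1 capability check above; BLK_TC_END_V1 is
 * assumed to name the last trace class a 32-bit v1 action can carry. */
static inline bool action_fits_v1(u64 what)
{
	return (what >> BLK_TC_SHIFT) <= BLK_TC_END_V1;
}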
@@ -255,13 +387,68 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
buffer = blk_tr->array_buffer.buffer;
trace_ctx = tracing_gen_ctx_flags(0);
+ switch (bt->version) {
+ case 1:
+ trace_len = sizeof(struct blk_io_trace);
+ break;
+ case 2:
+ default:
+ /*
+ * ftrace always records in the v2 (blk_io_trace2) format.
+ *
+ * The sysfs path (/sys/block/DEV/trace/enable) goes through
+ * blk_trace_setup_queue(), which never sets bt->version, so it
+ * is still 0 from kzalloc().  Treat that as v2 and normalize
+ * bt->version so trace_len is never left unset.
+ */
+ trace_len = sizeof(struct blk_io_trace2);
+ if (bt->version == 0)
+ bt->version = 2;
+ break;
+ }
+ trace_len += pdu_len + cgid_len;
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + pdu_len + cgid_len,
- trace_ctx);
+ trace_len, trace_ctx);
if (!event)
return;
- t = ring_buffer_event_data(event);
- goto record_it;
+
+ switch (bt->version) {
+ case 1:
+ record_blktrace_event(ring_buffer_event_data(event),
+ pid, cpu, sector, bytes,
+ what, bt->dev, error, cgid, cgid_len,
+ pdu_data, pdu_len);
+ break;
+ case 2:
+ default:
+ /*
+ * v2 records (and the version==0 case normalized above) use
+ * blk_io_trace2, whose layout differs from v1: a 32-bit pid at
+ * offset 28 followed by a 64-bit action at offset 32.  This
+ * matches the v2-sized buffer reserved above.
+ */
+ record_blktrace_event2(ring_buffer_event_data(event),
+ pid, cpu, sector, bytes,
+ what, bt->dev, error, cgid, cgid_len,
+ pdu_data, pdu_len);
+ break;
+ }
+
+ trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
+ return;
}
if (unlikely(tsk->btrace_seq != blktrace_seq))
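For the ftrace path the reserved event size and the record helper must agree with bt->version; the comment above pins the v2 layout (32-bit pid at offset 28, 64-bit action at offset 32). A compile-time check in the spirit of the BUILD_BUG_ON()s this patch adds in init_blk_tracer() could look like the sketch below; the offsets are restated from that comment, not independently verified here.

#include <linux/build_bug.h>
#include <linux/stddef.h>

static inline void blk_io_trace2_layout_check(void)
{
	/* Assumed offsets, taken from the comment in __blk_add_trace(). */
	BUILD_BUG_ON(offsetof(struct blk_io_trace2, pid) != 28);
	BUILD_BUG_ON(offsetof(struct blk_io_trace2, action) != 32);
	BUILD_BUG_ON(sizeof_field(struct blk_io_trace2, action) != 8);
}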
@@ -273,41 +460,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
* from coming in and stepping on our toes.
*/
local_irq_save(flags);
- t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len);
- if (t) {
- sequence = per_cpu_ptr(bt->sequence, cpu);
-
- t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
- t->sequence = ++(*sequence);
- t->time = ktime_to_ns(ktime_get());
-record_it:
- /*
- * These two are not needed in ftrace as they are in the
- * generic trace_entry, filled by tracing_generic_entry_update,
- * but for the trace_event->bin() synthesizer benefit we do it
- * here too.
- */
- t->cpu = cpu;
- t->pid = pid;
-
- t->sector = sector;
- t->bytes = bytes;
- t->action = what;
- t->device = bt->dev;
- t->error = error;
- t->pdu_len = pdu_len + cgid_len;
-
- if (cgid_len)
- memcpy((void *)t + sizeof(*t), &cgid, cgid_len);
- if (pdu_len)
- memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len);
-
- if (blk_tracer) {
- trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx);
- return;
- }
- }
-
+ sequence = per_cpu_ptr(bt->sequence, cpu);
+ (*sequence)++;
+ relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes,
+ what, error, cgid, cgid_len, pdu_data, pdu_len);
local_irq_restore(flags);
}
@@ -494,9 +650,10 @@ static void blk_trace_setup_lba(struct blk_trace *bt,
/*
* Setup everything required to start tracing
*/
-static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- struct blk_user_trace_setup *buts)
+static struct blk_trace *blk_trace_setup_prepare(struct request_queue *q,
+ char *name, dev_t dev,
+ u32 buf_size, u32 buf_nr,
+ struct block_device *bdev)
{
struct blk_trace *bt = NULL;
struct dentry *dir = NULL;
@@ -504,31 +661,19 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
lockdep_assert_held(&q->debugfs_mutex);
- if (!buts->buf_size || !buts->buf_nr)
- return -EINVAL;
-
- strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE);
-
- /*
- * some device names have larger paths - convert the slashes
- * to underscores for this to work as expected
- */
- strreplace(buts->name, '/', '_');
-
/*
* bdev can be NULL, as with scsi-generic, this is as helpful as
* we can be.
*/
if (rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->debugfs_mutex))) {
- pr_warn("Concurrent blktraces are not allowed on %s\n",
- buts->name);
- return -EBUSY;
+ pr_warn("Concurrent blktraces are not allowed on %s\n", name);
+ return ERR_PTR(-EBUSY);
}
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
ret = -ENOMEM;
bt->sequence = alloc_percpu(unsigned long);
@@ -548,7 +693,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (bdev && !bdev_is_partition(bdev))
dir = q->debugfs_dir;
else
- bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root);
+ bt->dir = dir = debugfs_create_dir(name, blk_debugfs_root);
/*
* As blktrace relies on debugfs for its interface the debugfs directory
@@ -556,8 +701,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
* files or directories.
*/
if (IS_ERR_OR_NULL(dir)) {
- pr_warn("debugfs_dir not present for %s so skipping\n",
- buts->name);
+ pr_warn("debugfs_dir not present for %s so skipping\n", name);
ret = -ENOENT;
goto err;
}
@@ -569,17 +713,40 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
- bt->rchan = relay_open("trace", dir, buts->buf_size,
- buts->buf_nr, &blk_relay_callbacks, bt);
+ bt->rchan = relay_open("trace", dir, buf_size, buf_nr,
+ &blk_relay_callbacks, bt);
if (!bt->rchan)
goto err;
+ blk_trace_setup_lba(bt, bdev);
+
+ return bt;
+
+err:
+ blk_trace_free(q, bt);
+
+ return ERR_PTR(ret);
+}
+
+static void blk_trace_setup_finalize(struct request_queue *q,
+ char *name, int version,
+ struct blk_trace *bt,
+ struct blk_user_trace_setup2 *buts)
+{
+ strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2);
+
+ /*
+ * some device names have larger paths - convert the slashes
+ * to underscores for this to work as expected
+ */
+ strreplace(buts->name, '/', '_');
+
+ bt->version = version;
bt->act_mask = buts->act_mask;
if (!bt->act_mask)
bt->act_mask = (u16) -1;
- blk_trace_setup_lba(bt, bdev);
-
/* overwrite with user settings */
if (buts->start_lba)
bt->start_lba = buts->start_lba;
@@ -591,30 +758,43 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
-
- ret = 0;
-err:
- if (ret)
- blk_trace_free(q, bt);
- return ret;
}
int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct block_device *bdev,
char __user *arg)
{
+ struct blk_user_trace_setup2 buts2;
struct blk_user_trace_setup buts;
+ struct blk_trace *bt;
int ret;
ret = copy_from_user(&buts, arg, sizeof(buts));
if (ret)
return -EFAULT;
+ if (!buts.buf_size || !buts.buf_nr)
+ return -EINVAL;
+
+ buts2 = (struct blk_user_trace_setup2) {
+ .act_mask = buts.act_mask,
+ .buf_size = buts.buf_size,
+ .buf_nr = buts.buf_nr,
+ .start_lba = buts.start_lba,
+ .end_lba = buts.end_lba,
+ .pid = buts.pid,
+ };
+
mutex_lock(&q->debugfs_mutex);
- ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr,
+ bdev);
+ if (IS_ERR(bt)) {
+ mutex_unlock(&q->debugfs_mutex);
+ return PTR_ERR(bt);
+ }
+ blk_trace_setup_finalize(q, name, 1, bt, &buts2);
+ strcpy(buts.name, buts2.name);
mutex_unlock(&q->debugfs_mutex);
- if (ret)
- return ret;
if (copy_to_user(arg, &buts, sizeof(buts))) {
blk_trace_remove(q);
@@ -624,19 +804,54 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
}
EXPORT_SYMBOL_GPL(blk_trace_setup);
+static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev, char __user *arg)
+{
+ struct blk_user_trace_setup2 buts2;
+ struct blk_trace *bt;
+
+ if (copy_from_user(&buts2, arg, sizeof(buts2)))
+ return -EFAULT;
+
+ if (!buts2.buf_size || !buts2.buf_nr)
+ return -EINVAL;
+
+ if (buts2.flags != 0)
+ return -EINVAL;
+
+ mutex_lock(&q->debugfs_mutex);
+ bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr,
+ bdev);
+ if (IS_ERR(bt)) {
+ mutex_unlock(&q->debugfs_mutex);
+ return PTR_ERR(bt);
+ }
+ blk_trace_setup_finalize(q, name, 2, bt, &buts2);
+ mutex_unlock(&q->debugfs_mutex);
+
+ if (copy_to_user(arg, &buts2, sizeof(buts2))) {
+ blk_trace_remove(q);
+ return -EFAULT;
+ }
+ return 0;
+}
+
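blk_trace_setup2() is the kernel half of the new BLKTRACESETUP2 ioctl: it validates buf_size/buf_nr, insists that the new flags word is zero, builds a version-2 trace, and copies the sanitized name back. A rough user-space sketch of driving it is below; it assumes the updated UAPI header exports BLKTRACESETUP2 and struct blk_user_trace_setup2 with the fields this diff shows, and it omits teardown (BLKTRACESTOP/BLKTRACETEARDOWN) and error reporting.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blktrace_api.h>

static int start_v2_trace(const char *devpath)
{
	struct blk_user_trace_setup2 buts2;
	int fd = open(devpath, O_RDONLY | O_NONBLOCK);

	if (fd < 0)
		return -1;

	memset(&buts2, 0, sizeof(buts2));
	buts2.act_mask = 0;		/* 0 -> kernel traces everything */
	buts2.buf_size = 512 * 1024;	/* relay sub-buffer size */
	buts2.buf_nr = 4;		/* sub-buffers per CPU */
	/* buts2.flags stays 0: the kernel rejects any other value. */

	if (ioctl(fd, BLKTRACESETUP2, &buts2) < 0) {
		close(fd);
		return -1;
	}

	/* The kernel hands back the sanitized name used under debugfs. */
	printf("trace relay files live under .../block/%s/\n", buts2.name);
	return fd;
}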
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
static int compat_blk_trace_setup(struct request_queue *q, char *name,
dev_t dev, struct block_device *bdev,
char __user *arg)
{
- struct blk_user_trace_setup buts;
+ struct blk_user_trace_setup2 buts2;
struct compat_blk_user_trace_setup cbuts;
- int ret;
+ struct blk_trace *bt;
if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
return -EFAULT;
- buts = (struct blk_user_trace_setup) {
+ if (!cbuts.buf_size || !cbuts.buf_nr)
+ return -EINVAL;
+
+ buts2 = (struct blk_user_trace_setup2) {
.act_mask = cbuts.act_mask,
.buf_size = cbuts.buf_size,
.buf_nr = cbuts.buf_nr,
@@ -646,12 +861,16 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
};
mutex_lock(&q->debugfs_mutex);
- ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr,
+ bdev);
+ if (IS_ERR(bt)) {
+ mutex_unlock(&q->debugfs_mutex);
+ return PTR_ERR(bt);
+ }
+ blk_trace_setup_finalize(q, name, 1, bt, &buts2);
mutex_unlock(&q->debugfs_mutex);
- if (ret)
- return ret;
- if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
+ if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) {
blk_trace_remove(q);
return -EFAULT;
}
@@ -707,6 +926,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
char b[BDEVNAME_SIZE];
switch (cmd) {
+ case BLKTRACESETUP2:
+ snprintf(b, sizeof(b), "%pg", bdev);
+ ret = blk_trace_setup2(q, b, bdev->bd_dev, bdev, arg);
+ break;
case BLKTRACESETUP:
snprintf(b, sizeof(b), "%pg", bdev);
ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
@@ -794,7 +1017,7 @@ blk_trace_request_get_cgid(struct request *rq)
*
**/
static void blk_add_trace_rq(struct request *rq, blk_status_t error,
- unsigned int nr_bytes, u32 what, u64 cgid)
+ unsigned int nr_bytes, u64 what, u64 cgid)
{
struct blk_trace *bt;
@@ -846,6 +1069,22 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
blk_trace_request_get_cgid(rq));
}
+static void blk_add_trace_zone_update_request(void *ignore, struct request *rq)
+{
+ struct blk_trace *bt;
+
+ rcu_read_lock();
+ bt = rcu_dereference(rq->q->blk_trace);
+ if (likely(!bt) || bt->version < 2) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ZONE_APPEND,
+ blk_trace_request_get_cgid(rq));
+}
+
/**
* blk_add_trace_bio - Add a trace for a bio oriented action
* @q: queue the io is for
@@ -858,7 +1097,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
*
**/
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what, int error)
+ u64 what, int error)
{
struct blk_trace *bt;
@@ -924,7 +1163,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(depth);
- u32 what;
+ u64 what;
if (explicit)
what = BLK_TA_UNPLUG_IO;
@@ -936,6 +1175,37 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
rcu_read_unlock();
}
+static void blk_add_trace_zone_plug(void *ignore, struct request_queue *q,
+ unsigned int zno, sector_t sector,
+ unsigned int sectors)
+{
+ struct blk_trace *bt;
+
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (bt && bt->version >= 2)
+ __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0,
+ BLK_TA_ZONE_PLUG, 0, 0, NULL, 0);
+ rcu_read_unlock();
+}
+
+static void blk_add_trace_zone_unplug(void *ignore, struct request_queue *q,
+ unsigned int zno, sector_t sector,
+ unsigned int sectors)
+{
+ struct blk_trace *bt;
+
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (bt && bt->version >= 2)
+ __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0,
+ BLK_TA_ZONE_UNPLUG, 0, 0, NULL, 0);
+ rcu_read_unlock();
+}
+
static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
@@ -1076,6 +1346,15 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
WARN_ON(ret);
+ ret = register_trace_blk_zone_append_update_request_bio(
+ blk_add_trace_zone_update_request, NULL);
+ WARN_ON(ret);
+ ret = register_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug,
+ NULL);
+ WARN_ON(ret);
+ ret = register_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug,
+ NULL);
+ WARN_ON(ret);
ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
@@ -1095,6 +1374,10 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_split(blk_add_trace_split, NULL);
unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
unregister_trace_block_plug(blk_add_trace_plug, NULL);
+ unregister_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, NULL);
+ unregister_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, NULL);
+ unregister_trace_blk_zone_append_update_request_bio(
+ blk_add_trace_zone_update_request, NULL);
unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
@@ -1113,7 +1396,7 @@ static void blk_unregister_tracepoints(void)
* struct blk_io_tracer formatting routines
*/
-static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
+static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t)
{
int i = 0;
int tc = t->action >> BLK_TC_SHIFT;
@@ -1128,7 +1411,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
if (tc & BLK_TC_DISCARD)
rwbs[i++] = 'D';
- else if (tc & BLK_TC_WRITE)
+ else if (tc & BLK_TC_WRITE_ZEROES) {
+ rwbs[i++] = 'W';
+ rwbs[i++] = 'Z';
+ } else if (tc & BLK_TC_WRITE)
rwbs[i++] = 'W';
else if (t->bytes)
rwbs[i++] = 'R';
@@ -1148,9 +1434,9 @@ out:
}
static inline
-const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
+const struct blk_io_trace2 *te_blk_io_trace(const struct trace_entry *ent)
{
- return (const struct blk_io_trace *)ent;
+ return (const struct blk_io_trace2 *)ent;
}
static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg)
@@ -1209,7 +1495,7 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act,
unsigned long long ts = iter->ts;
unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
unsigned secs = (unsigned long)ts;
- const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+ const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
@@ -1223,7 +1509,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act,
bool has_cg)
{
char rwbs[RWBS_LEN];
- const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+ const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent);
fill_rwbs(rwbs, t);
if (has_cg) {
@@ -1444,7 +1730,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
{
struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
- const struct blk_io_trace *t;
+ const struct blk_io_trace2 *t;
u16 what;
bool long_act;
blk_log_action_t *log_action;
@@ -1481,8 +1767,8 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
static void blk_trace_synthesize_old_trace(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
- struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
- const int offset = offsetof(struct blk_io_trace, sector);
+ struct blk_io_trace2 *t = (struct blk_io_trace2 *)iter->ent;
+ const int offset = offsetof(struct blk_io_trace2, sector);
struct blk_io_trace old = {
.magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
.time = iter->ts,
@@ -1559,6 +1845,10 @@ static int __init init_blk_tracer(void)
return 1;
}
+ BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) %
+ __alignof__(long));
+ BUILD_BUG_ON(__alignof__(struct blk_io_trace2) % __alignof__(long));
+
return 0;
}
@@ -1667,6 +1957,7 @@ static const struct {
{ BLK_TC_DISCARD, "discard" },
{ BLK_TC_DRV_DATA, "drv_data" },
{ BLK_TC_FUA, "fua" },
+ { BLK_TC_WRITE_ZEROES, "write-zeroes" },
};
static int blk_trace_str2mask(const char *str)
@@ -1880,6 +2171,10 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf)
rwbs[i++] = 'Z';
rwbs[i++] = 'C';
break;
+ case REQ_OP_WRITE_ZEROES:
+ rwbs[i++] = 'W';
+ rwbs[i++] = 'Z';
+ break;
default:
rwbs[i++] = 'N';
}
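With two record formats in the relay stream, a consumer can key off the version byte the kernel stamps into ->magic (BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION for v1, BLK_IO_TRACE2_VERSION for v2) before deciding how large the fixed header is. A reader-side sketch follows; BLK_IO_TRACE2_VERSION is new in this series and its numeric value is not visible in this diff, so it is only referenced symbolically.

#include <stdint.h>
#include <linux/blktrace_api.h>

/* Returns the format version byte, or -1 if this is not a blktrace
 * record.  The magic proper occupies the upper three bytes. */
static int blktrace_record_version(uint32_t magic)
{
	if ((magic & 0xffffff00u) != BLK_IO_TRACE_MAGIC)
		return -1;
	return magic & 0xffu;	/* BLK_IO_TRACE_VERSION or BLK_IO_TRACE2_VERSION */
}

From there the reader picks sizeof(struct blk_io_trace) or sizeof(struct blk_io_trace2) as the fixed header size before consuming the pdu_len trailer bytes.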
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4f87c16d915a..d57727abaade 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2529,7 +2529,7 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
return run_ctx->entry_ip;
}
-static int
+static __always_inline int
kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
unsigned long entry_ip, struct ftrace_regs *fregs,
bool is_return, void *data)
@@ -3372,13 +3372,13 @@ typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struc
* direct calls into all the specific callback implementations
* (copy_user_data_sleepable, copy_user_data_nofault, and so on)
*/
-static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size,
+static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u64 doff, u64 size,
const void *unsafe_src,
copy_fn_t str_copy_fn,
struct task_struct *tsk)
{
struct bpf_dynptr_kern *dst;
- u32 chunk_sz, off;
+ u64 chunk_sz, off;
void *dst_slice;
int cnt, err;
char buf[256];
@@ -3392,7 +3392,7 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do
return -E2BIG;
for (off = 0; off < size; off += chunk_sz - 1) {
- chunk_sz = min_t(u32, sizeof(buf), size - off);
+ chunk_sz = min_t(u64, sizeof(buf), size - off);
/* Expect str_copy_fn to return count of copied bytes, including
* zero terminator. Next iteration increment off by chunk_sz - 1 to
* overwrite NUL.
@@ -3409,14 +3409,14 @@ static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 do
return off;
}
-static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff,
- u32 size, const void *unsafe_src,
+static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u64 doff,
+ u64 size, const void *unsafe_src,
copy_fn_t copy_fn, struct task_struct *tsk)
{
struct bpf_dynptr_kern *dst;
void *dst_slice;
char buf[256];
- u32 off, chunk_sz;
+ u64 off, chunk_sz;
int err;
dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
@@ -3428,7 +3428,7 @@ static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32
return -E2BIG;
for (off = 0; off < size; off += chunk_sz) {
- chunk_sz = min_t(u32, sizeof(buf), size - off);
+ chunk_sz = min_t(u64, sizeof(buf), size - off);
err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
if (err)
return err;
@@ -3514,58 +3514,58 @@ __bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid
return bpf_send_signal_common(sig, type, task, value);
}
-__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_data_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_kernel_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, unsafe_ptr__ign,
copy_kernel_data_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_str_nofault, NULL);
}
-__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void *unsafe_ptr__ign)
+__bpf_kfunc int bpf_probe_read_kernel_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, unsafe_ptr__ign,
copy_kernel_str_nofault, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_copy_from_user_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_data_sleepable, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign)
+__bpf_kfunc int bpf_copy_from_user_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_str_sleepable, NULL);
}
-__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign,
+__bpf_kfunc int bpf_copy_from_user_task_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
return __bpf_dynptr_copy(dptr, off, size, (const void *)unsafe_ptr__ign,
copy_user_data_sleepable, tsk);
}
-__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u32 off,
- u32 size, const void __user *unsafe_ptr__ign,
+__bpf_kfunc int bpf_copy_from_user_task_str_dynptr(struct bpf_dynptr *dptr, u64 off,
+ u64 size, const void __user *unsafe_ptr__ign,
struct task_struct *tsk)
{
return __bpf_dynptr_copy_str(dptr, off, size, (const void *)unsafe_ptr__ign,
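The dynptr copy kfuncs above keep moving data through a fixed 256-byte on-stack buffer even with the offsets and sizes widened to u64; for the string variants, each round copies up to sizeof(buf) bytes including a NUL and the offset then advances by chunk_sz - 1 so the next round overwrites that NUL. A stand-alone model of that stitching (plain memory copies, no BPF or dynptr plumbing, and simplified truncation handling):

#include <stddef.h>
#include <string.h>

/* Copy a NUL-terminated string into dst (capacity size) through a
 * 256-byte bounce buffer, overlapping chunks by one byte so every
 * intermediate terminator is overwritten by the next chunk. */
static size_t copy_str_chunked(char *dst, const char *src, size_t size)
{
	char buf[256];
	size_t off, chunk_sz, cnt;

	for (off = 0; off < size; off += chunk_sz - 1) {
		chunk_sz = size - off < sizeof(buf) ? size - off : sizeof(buf);

		/* Mimic strncpy_from_*_nofault(): copy, force a NUL,
		 * report the byte count including that NUL. */
		cnt = strnlen(src + off, chunk_sz - 1) + 1;
		memcpy(buf, src + off, cnt - 1);
		buf[cnt - 1] = '\0';

		memcpy(dst + off, buf, cnt);
		if (cnt < chunk_sz || chunk_sz < sizeof(buf))
			return off + cnt;	/* source or dst exhausted */
	}
	return off;
}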
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 59cfacb8a5bb..bbb37c0f8c6c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5951,7 +5951,8 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
del = __ftrace_lookup_ip(direct_functions, entry->ip);
- if (del && del->direct == addr) {
+ if (del && ftrace_jmp_get(del->direct) ==
+ ftrace_jmp_get(addr)) {
remove_hash_entry(direct_functions, del);
kfree(del);
}
@@ -6016,8 +6017,15 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
if (ftrace_hash_empty(hash))
return -EINVAL;
+ /* This is a "raw" address, and this should never happen. */
+ if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
+ return -EINVAL;
+
mutex_lock(&direct_mutex);
+ if (ops->flags & FTRACE_OPS_FL_JMP)
+ addr = ftrace_jmp_set(addr);
+
/* Make sure requested entries are not already registered.. */
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
@@ -6138,6 +6146,13 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
lockdep_assert_held_once(&direct_mutex);
+ /* This is a "raw" address, and this should never happen. */
+ if (WARN_ON_ONCE(ftrace_is_jmp(addr)))
+ return -EINVAL;
+
+ if (ops->flags & FTRACE_OPS_FL_JMP)
+ addr = ftrace_jmp_set(addr);
+
/* Enable the tmp_ops to have the same functions as the direct ops */
ftrace_ops_init(&tmp_ops);
tmp_ops.func_hash = ops->func_hash;
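In the ftrace.c hunks, register_ftrace_direct() and __modify_ftrace_direct() now tag the direct trampoline address with ftrace_jmp_set() when the ops carries FTRACE_OPS_FL_JMP, and lookups compare through ftrace_jmp_get() so tagged and untagged forms of the same address match. Those helpers are defined elsewhere in this series; the snippet below shows one plausible shape (a flag bit carried in the address) purely to illustrate how the set/get/is operations relate, and should not be read as the series' actual encoding.

#include <linux/types.h>

/* Hypothetical encoding: bit 0 of the (aligned) trampoline address
 * doubles as the "use a jmp instead of a call" tag. */
#define FTRACE_JMP_BIT_SKETCH	0x1UL

static inline unsigned long ftrace_jmp_set_sketch(unsigned long addr)
{
	return addr | FTRACE_JMP_BIT_SKETCH;
}

static inline unsigned long ftrace_jmp_get_sketch(unsigned long addr)
{
	return addr & ~FTRACE_JMP_BIT_SKETCH;	/* strip tag for comparisons */
}

static inline bool ftrace_is_jmp_sketch(unsigned long addr)
{
	return addr & FTRACE_JMP_BIT_SKETCH;
}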