De-duplicate the calculation of the trace length instead of doing the calculation twice, once for calling trace_buffer_lock_reserve() and once for calling relay_reserve(). Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn --- kernel/trace/blktrace.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 6941145b5058..bc4b885f2cec 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -76,13 +76,14 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; + size_t trace_len; + trace_len = sizeof(*t) + cgid_len + len; if (blk_tracer) { buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); @@ -92,7 +93,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); + t = relay_reserve(bt->rchan, trace_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); @@ -228,6 +229,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? 
sizeof(cgid) : 0; const enum req_op op = opf & REQ_OP_MASK; + size_t trace_len; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -250,14 +252,14 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; cpu = raw_smp_processor_id(); + trace_len = sizeof(*t) + pdu_len + cgid_len; if (blk_tracer) { tracing_record_cmdline(current); buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len + cgid_len, - trace_ctx); + trace_len, trace_ctx); if (!event) return; t = ring_buffer_event_data(event); @@ -273,7 +275,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. */ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); + t = relay_reserve(bt->rchan, trace_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); -- 2.51.0 Factor out the recording of a blktrace event into its own function, deduplicating the code. This also enables recording different versions of the blktrace protocol later on. 
Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn --- kernel/trace/blktrace.c | 89 +++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc4b885f2cec..25a0a1b09747 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -63,6 +63,34 @@ static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); +static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, + sector_t sector, int bytes, u32 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) + +{ + /* + * These two are not needed in ftrace as they are in the + * generic trace_entry, filled by tracing_generic_entry_update, + * but for the trace_event->bin() synthesizer benefit we do it + * here too. + */ + t->cpu = cpu; + t->pid = pid; + + t->sector = sector; + t->bytes = bytes; + t->action = what; + t->device = dev; + t->error = error; + t->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t + sizeof(*t), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); +} + /* * Send out a notify message. */ @@ -87,7 +115,12 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!event) return; t = ring_buffer_event_data(event); - goto record_it; + record_blktrace_event(t, pid, cpu, 0, 0, + action | (cgid ? __BLK_TN_CGROUP : 0), + bt->dev, 0, cgid, cgid_len, (void *)data, + len); + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (!bt->rchan) @@ -97,18 +130,11 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); -record_it: - t->device = bt->dev; - t->action = action | (cgid ? 
__BLK_TN_CGROUP : 0); - t->pid = pid; - t->cpu = cpu; - t->pdu_len = len + cgid_len; - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - memcpy((void *) t + sizeof(*t) + cgid_len, data, len); - - if (blk_tracer) - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + + record_blktrace_event(t, pid, cpu, 0, 0, + action | (cgid ? __BLK_TN_CGROUP : 0), + bt->dev, 0, cgid, cgid_len, (void *)data, + len); } } @@ -263,7 +289,12 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (!event) return; t = ring_buffer_event_data(event); - goto record_it; + + record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, + error, cgid, cgid_len, pdu_data, pdu_len); + + trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); + return; } if (unlikely(tsk->btrace_seq != blktrace_seq)) @@ -282,32 +313,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->sequence = ++(*sequence); t->time = ktime_to_ns(ktime_get()); -record_it: - /* - * These two are not needed in ftrace as they are in the - * generic trace_entry, filled by tracing_generic_entry_update, - * but for the trace_event->bin() synthesizer benefit we do it - * here too. - */ - t->cpu = cpu; - t->pid = pid; - - t->sector = sector; - t->bytes = bytes; - t->action = what; - t->device = bt->dev; - t->error = error; - t->pdu_len = pdu_len + cgid_len; - - if (cgid_len) - memcpy((void *)t + sizeof(*t), &cgid, cgid_len); - if (pdu_len) - memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); - - if (blk_tracer) { - trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); - return; - } + + record_blktrace_event(t, pid, cpu, sector, bytes, what, + bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); } local_irq_restore(flags); -- 2.51.0 Split out the code relaying a blktrace event to user-space using relayfs. 
This enables adding a second version supporting a new version of the protocol. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn --- kernel/trace/blktrace.c | 60 ++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 25a0a1b09747..51745832c713 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -91,6 +91,26 @@ static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); } +static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u32 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace *t; + size_t trace_len = sizeof(*t) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + /* * Send out a notify message. */ @@ -126,16 +146,9 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - t = relay_reserve(bt->rchan, trace_len); - if (t) { - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->time = ktime_to_ns(ktime_get()); - - record_blktrace_event(t, pid, cpu, 0, 0, - action | (cgid ? __BLK_TN_CGROUP : 0), - bt->dev, 0, cgid, cgid_len, (void *)data, - len); - } + relay_blktrace_event(bt, 0, pid, cpu, 0, 0, + action | (cgid ? 
__BLK_TN_CGROUP : 0), 0, cgid, + cgid_len, (void *)data, len); } /* @@ -246,7 +259,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; - struct blk_io_trace *t; unsigned long flags = 0; unsigned long *sequence; unsigned int trace_ctx = 0; @@ -278,20 +290,21 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; cpu = raw_smp_processor_id(); - trace_len = sizeof(*t) + pdu_len + cgid_len; if (blk_tracer) { tracing_record_cmdline(current); buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); + trace_len = sizeof(struct blk_io_trace) + pdu_len + cgid_len; event = trace_buffer_lock_reserve(buffer, TRACE_BLK, trace_len, trace_ctx); if (!event) return; - t = ring_buffer_event_data(event); - record_blktrace_event(t, pid, cpu, sector, bytes, what, bt->dev, - error, cgid, cgid_len, pdu_data, pdu_len); + record_blktrace_event(ring_buffer_event_data(event), + pid, cpu, sector, bytes, what, bt->dev, + error, cgid, cgid_len, pdu_data, + pdu_len); trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; @@ -306,19 +319,10 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. 
*/ local_irq_save(flags); - t = relay_reserve(bt->rchan, trace_len); - if (t) { - sequence = per_cpu_ptr(bt->sequence, cpu); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->sequence = ++(*sequence); - t->time = ktime_to_ns(ktime_get()); - - record_blktrace_event(t, pid, cpu, sector, bytes, what, - bt->dev, error, cgid, cgid_len, - pdu_data, pdu_len); - } - + sequence = per_cpu_ptr(bt->sequence, cpu); + (*sequence)++; + relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, what, + error, cgid, cgid_len, pdu_data, pdu_len); local_irq_restore(flags); } -- 2.51.0 Untangle the if/else sequence setting the trace action in __blk_add_trace() and turn it into a switch statement for better extensibility. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn --- kernel/trace/blktrace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 51745832c713..11e264f67851 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -278,10 +278,19 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(opf, META); what |= MASK_TC_BIT(opf, PREFLUSH); what |= MASK_TC_BIT(opf, FUA); - if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) + + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: what |= BLK_TC_ACT(BLK_TC_DISCARD); - if (op == REQ_OP_FLUSH) + break; + case REQ_OP_FLUSH: what |= BLK_TC_ACT(BLK_TC_FLUSH); + break; + default: + break; + } + if (cgid) what |= __BLK_TA_CGROUP; -- 2.51.0 Change the internal use of the action in blktrace to 64 bits, although for now only the lower 32 bits will be used. With the upcoming version 2 of the blktrace user-space protocol, the upper 32 bits will also be utilized. 
Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn --- kernel/trace/blktrace.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 11e264f67851..15d6788700ca 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -64,7 +64,7 @@ static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, - sector_t sector, int bytes, u32 what, + sector_t sector, int bytes, u64 what, dev_t dev, int error, u64 cgid, ssize_t cgid_len, void *pdu_data, int pdu_len) @@ -80,7 +80,7 @@ static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, t->sector = sector; t->bytes = bytes; - t->action = what; + t->action = lower_32_bits(what); t->device = dev; t->error = error; t->pdu_len = pdu_len + cgid_len; @@ -93,7 +93,7 @@ static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, pid_t pid, int cpu, sector_t sector, int bytes, - u32 what, int error, u64 cgid, + u64 what, int error, u64 cgid, ssize_t cgid_len, void *pdu_data, int pdu_len) { struct blk_io_trace *t; @@ -114,7 +114,7 @@ static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, /* * Send out a notify message. */ -static void trace_note(struct blk_trace *bt, pid_t pid, int action, +static void trace_note(struct blk_trace *bt, pid_t pid, u64 action, const void *data, size_t len, u64 cgid) { struct blk_io_trace *t; @@ -127,6 +127,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, size_t trace_len; trace_len = sizeof(*t) + cgid_len + len; + action = lower_32_bits(action | (cgid ? 
__BLK_TN_CGROUP : 0)); if (blk_tracer) { buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); @@ -136,9 +137,8 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, return; t = ring_buffer_event_data(event); record_blktrace_event(t, pid, cpu, 0, 0, - action | (cgid ? __BLK_TN_CGROUP : 0), - bt->dev, 0, cgid, cgid_len, (void *)data, - len); + action, bt->dev, 0, cgid, cgid_len, + (void *)data, len); trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; } @@ -146,8 +146,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - relay_blktrace_event(bt, 0, pid, cpu, 0, 0, - action | (cgid ? __BLK_TN_CGROUP : 0), 0, cgid, + relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid, cgid_len, (void *)data, len); } @@ -222,7 +221,7 @@ void __blk_trace_note_message(struct blk_trace *bt, } EXPORT_SYMBOL_GPL(__blk_trace_note_message); -static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, +static int act_log_check(struct blk_trace *bt, u64 what, sector_t sector, pid_t pid) { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) @@ -253,7 +252,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), * blk_io_trace structure and places it in a per-cpu subbuffer. 
*/ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - const blk_opf_t opf, u32 what, int error, + const blk_opf_t opf, u64 what, int error, int pdu_len, void *pdu_data, u64 cgid) { struct task_struct *tsk = current; @@ -311,9 +310,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; record_blktrace_event(ring_buffer_event_data(event), - pid, cpu, sector, bytes, what, bt->dev, - error, cgid, cgid_len, pdu_data, - pdu_len); + pid, cpu, sector, bytes, + what, bt->dev, error, cgid, cgid_len, + pdu_data, pdu_len); trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; @@ -330,8 +329,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, local_irq_save(flags); sequence = per_cpu_ptr(bt->sequence, cpu); (*sequence)++; - relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, what, - error, cgid, cgid_len, pdu_data, pdu_len); + relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, + lower_32_bits(what), error, cgid, cgid_len, + pdu_data, pdu_len); local_irq_restore(flags); } @@ -818,7 +818,7 @@ blk_trace_request_get_cgid(struct request *rq) * **/ static void blk_add_trace_rq(struct request *rq, blk_status_t error, - unsigned int nr_bytes, u32 what, u64 cgid) + unsigned int nr_bytes, u64 what, u64 cgid) { struct blk_trace *bt; @@ -882,7 +882,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u64 what, int error) { struct blk_trace *bt; @@ -948,7 +948,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); - u32 what; + u64 what; if (explicit) what = BLK_TA_UNPLUG_IO; -- 2.51.0 Split do_blk_trace_setup into two functions, this is done to prepare for an incoming new BLKTRACESETUP2 ioctl(2) which can receive extended parameters 
from user-space. Also move the size verification logic to the callers in preparation for using a new internal structure later. Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn --- kernel/trace/blktrace.c | 94 ++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 38 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 15d6788700ca..df90422ae613 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -518,9 +518,10 @@ static void blk_trace_setup_lba(struct blk_trace *bt, /* * Setup everything required to start tracing */ -static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - struct blk_user_trace_setup *buts) +static struct blk_trace *blk_trace_setup_prepare(struct request_queue *q, + char *name, dev_t dev, + u32 buf_size, u32 buf_nr, + struct block_device *bdev) { struct blk_trace *bt = NULL; struct dentry *dir = NULL; @@ -528,31 +529,19 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, lockdep_assert_held(&q->debugfs_mutex); - if (!buts->buf_size || !buts->buf_nr) - return -EINVAL; - - strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); - - /* - * some device names have larger paths - convert the slashes - * to underscores for this to work as expected - */ - strreplace(buts->name, '/', '_'); - /* * bdev can be NULL, as with scsi-generic, this is a helpful as * we can be. 
*/ if (rcu_dereference_protected(q->blk_trace, lockdep_is_held(&q->debugfs_mutex))) { - pr_warn("Concurrent blktraces are not allowed on %s\n", - buts->name); - return -EBUSY; + pr_warn("Concurrent blktraces are not allowed on %s\n", name); + return ERR_PTR(-EBUSY); } bt = kzalloc(sizeof(*bt), GFP_KERNEL); if (!bt) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = -ENOMEM; bt->sequence = alloc_percpu(unsigned long); @@ -572,7 +561,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (bdev && !bdev_is_partition(bdev)) dir = q->debugfs_dir; else - bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); + bt->dir = dir = debugfs_create_dir(name, blk_debugfs_root); /* * As blktrace relies on debugfs for its interface the debugfs directory @@ -580,8 +569,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * files or directories. */ if (IS_ERR_OR_NULL(dir)) { - pr_warn("debugfs_dir not present for %s so skipping\n", - buts->name); + pr_warn("debugfs_dir not present for %s so skipping\n", name); ret = -ENOENT; goto err; } @@ -593,17 +581,38 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - bt->rchan = relay_open("trace", dir, buts->buf_size, - buts->buf_nr, &blk_relay_callbacks, bt); + bt->rchan = relay_open("trace", dir, buf_size, buf_nr, + &blk_relay_callbacks, bt); if (!bt->rchan) goto err; + blk_trace_setup_lba(bt, bdev); + + return bt; + +err: + blk_trace_free(q, bt); + + return ERR_PTR(ret); +} + +static void blk_trace_setup_finalize(struct request_queue *q, + char *name, struct blk_trace *bt, + struct blk_user_trace_setup *buts) + +{ + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + strreplace(buts->name, '/', 
'_'); + bt->act_mask = buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; - blk_trace_setup_lba(bt, bdev); - /* overwrite with user settings */ if (buts->start_lba) bt->start_lba = buts->start_lba; @@ -615,12 +624,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, rcu_assign_pointer(q->blk_trace, bt); get_probe_ref(); - - ret = 0; -err: - if (ret) - blk_trace_free(q, bt); - return ret; } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, @@ -628,17 +631,25 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, char __user *arg) { struct blk_user_trace_setup buts; + struct blk_trace *bt; int ret; ret = copy_from_user(&buts, arg, sizeof(buts)); if (ret) return -EFAULT; + if (!buts.buf_size || !buts.buf_nr) + return -EINVAL; + mutex_lock(&q->debugfs_mutex); - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, bt, &buts); mutex_unlock(&q->debugfs_mutex); - if (ret) - return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { blk_trace_remove(q); @@ -655,11 +666,14 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, { struct blk_user_trace_setup buts; struct compat_blk_user_trace_setup cbuts; - int ret; + struct blk_trace *bt; if (copy_from_user(&cbuts, arg, sizeof(cbuts))) return -EFAULT; + if (!cbuts.buf_size || !cbuts.buf_nr) + return -EINVAL; + buts = (struct blk_user_trace_setup) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, @@ -670,10 +684,14 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, }; mutex_lock(&q->debugfs_mutex); - ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } 
+ blk_trace_setup_finalize(q, name, bt, &buts); mutex_unlock(&q->debugfs_mutex); - if (ret) - return ret; if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { blk_trace_remove(q); -- 2.51.0 Add definitions for a version 2 of the blk_user_trace_setup ioctl. This new ioctl will enable a different struct layout of the binary data passed to user-space when using a new version of the blktrace utility requesting the new struct layout. Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn --- include/uapi/linux/blktrace_api.h | 16 ++++++++++++++++ include/uapi/linux/fs.h | 1 + kernel/trace/blktrace.c | 3 +++ 3 files changed, 20 insertions(+) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 1bfb635e309b..a6958708d477 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -129,6 +129,7 @@ enum { }; #define BLKTRACE_BDEV_SIZE 32 +#define BLKTRACE_BDEV_SIZE2 64 /* * User setup structure passed with BLKTRACESETUP @@ -143,4 +144,19 @@ struct blk_user_trace_setup { __u32 pid; }; +/* + * User setup structure passed with BLKTRACESETUP2 + */ +struct blk_user_trace_setup2 { + char name[BLKTRACE_BDEV_SIZE2]; /* output */ + __u64 act_mask; /* input */ + __u32 buf_size; /* input */ + __u32 buf_nr; /* input */ + __u64 start_lba; + __u64 end_lba; + __u32 pid; + __u32 flags; /* currently unused */ + __u64 reserved[11]; +}; + #endif /* _UAPIBLKTRACE_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index beb4c2d1e41c..957ce3343a4f 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -300,6 +300,7 @@ struct file_attr { #define BLKGETDISKSEQ _IOR(0x12,128,__u64) /* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */ /* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */ +#define BLKTRACESETUP2 _IOWR(0x12, 142, struct blk_user_trace_setup2) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap 
access */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index df90422ae613..c31b8f433116 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1601,6 +1601,9 @@ static int __init init_blk_tracer(void) return 1; } + BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) % + __alignof__(long)); + return 0; } -- 2.51.0 Pass struct blk_user_trace_setup2 to blk_trace_setup_finalize(). This prepares for the incoming extension of the blktrace protocol with a 64-bit act_mask. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn --- include/linux/blktrace_api.h | 3 ++- kernel/trace/blktrace.c | 31 ++++++++++++++++++++++--------- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 122c62e561fc..05c8754456aa 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -14,11 +14,12 @@ #include struct blk_trace { + int version; int trace_state; struct rchan *rchan; unsigned long __percpu *sequence; unsigned char __percpu *msg_data; - u16 act_mask; + u64 act_mask; u64 start_lba; u64 end_lba; u32 pid; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c31b8f433116..d1532df84cc8 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -597,11 +597,12 @@ static struct blk_trace *blk_trace_setup_prepare(struct request_queue *q, } static void blk_trace_setup_finalize(struct request_queue *q, - char *name, struct blk_trace *bt, - struct blk_user_trace_setup *buts) + char *name, int version, + struct blk_trace *bt, + struct blk_user_trace_setup2 *buts) { - strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE2); /* * some device names have larger paths - convert the slashes @@ -609,6 +610,7 @@ static void blk_trace_setup_finalize(struct request_queue *q, */ strreplace(buts->name, '/', '_'); + bt->version = version; bt->act_mask = 
buts->act_mask; if (!bt->act_mask) bt->act_mask = (u16) -1; @@ -630,6 +632,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { + struct blk_user_trace_setup2 buts2; struct blk_user_trace_setup buts; struct blk_trace *bt; int ret; @@ -641,6 +644,15 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts.buf_size || !buts.buf_nr) return -EINVAL; + buts2 = (struct blk_user_trace_setup2) { + .act_mask = buts.act_mask, + .buf_size = buts.buf_size, + .buf_nr = buts.buf_nr, + .start_lba = buts.start_lba, + .end_lba = buts.end_lba, + .pid = buts.pid, + }; + mutex_lock(&q->debugfs_mutex); bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, bdev); @@ -648,7 +660,8 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, mutex_unlock(&q->debugfs_mutex); return PTR_ERR(bt); } - blk_trace_setup_finalize(q, name, bt, &buts); + blk_trace_setup_finalize(q, name, 1, bt, &buts2); + strcpy(buts.name, buts2.name); mutex_unlock(&q->debugfs_mutex); if (copy_to_user(arg, &buts, sizeof(buts))) { @@ -664,7 +677,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg) { - struct blk_user_trace_setup buts; + struct blk_user_trace_setup2 buts2; struct compat_blk_user_trace_setup cbuts; struct blk_trace *bt; @@ -674,7 +687,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, if (!cbuts.buf_size || !cbuts.buf_nr) return -EINVAL; - buts = (struct blk_user_trace_setup) { + buts2 = (struct blk_user_trace_setup2) { .act_mask = cbuts.act_mask, .buf_size = cbuts.buf_size, .buf_nr = cbuts.buf_nr, @@ -684,16 +697,16 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, }; mutex_lock(&q->debugfs_mutex); - bt = blk_trace_setup_prepare(q, name, dev, buts.buf_size, buts.buf_nr, + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, bdev); if 
(IS_ERR(bt)) { mutex_unlock(&q->debugfs_mutex); return PTR_ERR(bt); } - blk_trace_setup_finalize(q, name, bt, &buts); + blk_trace_setup_finalize(q, name, 1, bt, &buts2); mutex_unlock(&q->debugfs_mutex); - if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { + if (copy_to_user(arg, &buts2.name, ARRAY_SIZE(buts2.name))) { blk_trace_remove(q); return -EFAULT; } -- 2.51.0 Add definitions for the extended version of the blktrace protocol using a wider action type to be able to record new actions in the kernel. Signed-off-by: Johannes Thumshirn --- include/uapi/linux/blktrace_api.h | 16 ++++++++++++++++ kernel/trace/blktrace.c | 1 + 2 files changed, 17 insertions(+) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index a6958708d477..3a771b9802aa 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -94,6 +94,7 @@ enum blktrace_notify { #define BLK_IO_TRACE_MAGIC 0x65617400 #define BLK_IO_TRACE_VERSION 0x07 +#define BLK_IO_TRACE2_VERSION 0x08 /* * The trace itself @@ -113,6 +114,21 @@ struct blk_io_trace { /* cgroup id will be stored here if exists */ }; +struct blk_io_trace2 { + __u32 magic; /* MAGIC << 8 | BLK_IO_TRACE2_VERSION */ + __u32 sequence; /* event number */ + __u64 time; /* in nanoseconds */ + __u64 sector; /* disk offset */ + __u32 bytes; /* transfer length */ + __u32 pid; /* who did it */ + __u64 action; /* what happened */ + __u32 device; /* device number */ + __u32 cpu; /* on what cpu did it happen */ + __u16 error; /* completion error */ + __u16 pdu_len; /* length of data after this trace */ + __u8 pad[12]; + /* cgroup id will be stored here if exists */ +}; /* * The remap event */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d1532df84cc8..185f19c9f772 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1616,6 +1616,7 @@ static int __init init_blk_tracer(void) BUILD_BUG_ON(__alignof__(struct blk_user_trace_setup2) % __alignof__(long)); 
+ BUILD_BUG_ON(__alignof__(struct blk_io_trace2) % __alignof__(long)); return 0; } -- 2.51.0 Differentiate between blk_io_trace and blk_io_trace2 when relaying to user-space depending on which version has been requested by the blktrace utility. Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn --- kernel/trace/blktrace.c | 62 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 185f19c9f772..074a7d77158c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -91,7 +91,29 @@ static void record_blktrace_event(struct blk_io_trace *t, pid_t pid, int cpu, memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); } -static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, +static void record_blktrace_event2(struct blk_io_trace2 *t2, pid_t pid, int cpu, + sector_t sector, int bytes, u64 what, + dev_t dev, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, + int pdu_len) +{ + t2->pid = pid; + t2->cpu = cpu; + + t2->sector = sector; + t2->bytes = bytes; + t2->action = what; + t2->device = dev; + t2->error = error; + t2->pdu_len = pdu_len + cgid_len; + + if (cgid_len) + memcpy((void *)t2 + sizeof(*t2), &cgid, cgid_len); + if (pdu_len) + memcpy((void *)t2 + sizeof(*t2) + cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event1(struct blk_trace *bt, unsigned long sequence, pid_t pid, int cpu, sector_t sector, int bytes, u64 what, int error, u64 cgid, ssize_t cgid_len, void *pdu_data, int pdu_len) @@ -111,6 +133,40 @@ static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, cgid, cgid_len, pdu_data, pdu_len); } +static void relay_blktrace_event2(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, + int bytes, u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + struct blk_io_trace2 *t; + size_t trace_len = 
sizeof(struct blk_io_trace2) + pdu_len + cgid_len; + + t = relay_reserve(bt->rchan, trace_len); + if (!t) + return; + + t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE2_VERSION; + t->sequence = sequence; + t->time = ktime_to_ns(ktime_get()); + + record_blktrace_event2(t, pid, cpu, sector, bytes, what, bt->dev, error, + cgid, cgid_len, pdu_data, pdu_len); +} + +static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, + pid_t pid, int cpu, sector_t sector, int bytes, + u64 what, int error, u64 cgid, + ssize_t cgid_len, void *pdu_data, int pdu_len) +{ + if (bt->version == 2) + return relay_blktrace_event2(bt, sequence, pid, cpu, sector, + bytes, what, error, cgid, cgid_len, + pdu_data, pdu_len); + return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes, + lower_32_bits(what), error, cgid, cgid_len, + pdu_data, pdu_len); +} + /* * Send out a notify message. */ @@ -146,8 +202,8 @@ static void trace_note(struct blk_trace *bt, pid_t pid, u64 action, if (!bt->rchan) return; - relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid, - cgid_len, (void *)data, len); + relay_blktrace_event(bt, 0, pid, cpu, 0, 0, action, 0, cgid, cgid_len, + (void *)data, len); } /* -- 2.51.0 Move trace_note() to the new blk_io_trace2 infrastructure. 
Signed-off-by: Johannes Thumshirn --- kernel/trace/blktrace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 074a7d77158c..756d2c7dfdf2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -173,18 +173,18 @@ static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, static void trace_note(struct blk_trace *bt, pid_t pid, u64 action, const void *data, size_t len, u64 cgid) { - struct blk_io_trace *t; struct ring_buffer_event *event = NULL; struct trace_buffer *buffer = NULL; unsigned int trace_ctx = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; ssize_t cgid_len = cgid ? sizeof(cgid) : 0; - size_t trace_len; - trace_len = sizeof(*t) + cgid_len + len; action = lower_32_bits(action | (cgid ? __BLK_TN_CGROUP : 0)); if (blk_tracer) { + struct blk_io_trace2 *t; + size_t trace_len = sizeof(*t) + cgid_len + len; + buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, @@ -192,9 +192,9 @@ static void trace_note(struct blk_trace *bt, pid_t pid, u64 action, if (!event) return; t = ring_buffer_event_data(event); - record_blktrace_event(t, pid, cpu, 0, 0, - action, bt->dev, 0, cgid, cgid_len, - (void *)data, len); + record_blktrace_event2(t, pid, cpu, 0, 0, + action, bt->dev, 0, cgid, cgid_len, + (void *)data, len); trace_buffer_unlock_commit(blk_tr, buffer, event, trace_ctx); return; } @@ -359,7 +359,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->array_buffer.buffer; trace_ctx = tracing_gen_ctx_flags(0); - trace_len = sizeof(struct blk_io_trace) + pdu_len + cgid_len; + trace_len = sizeof(struct blk_io_trace2) + pdu_len + cgid_len; event = trace_buffer_lock_reserve(buffer, TRACE_BLK, trace_len, trace_ctx); if (!event) -- 2.51.0 Move ftrace's blk_io_tracer to the new blk_io_trace2 infrastructure. 
Signed-off-by: Johannes Thumshirn --- kernel/trace/blktrace.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 756d2c7dfdf2..8ffb218e9fb7 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1224,7 +1224,7 @@ static void blk_unregister_tracepoints(void) * struct blk_io_tracer formatting routines */ -static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +static void fill_rwbs(char *rwbs, const struct blk_io_trace2 *t) { int i = 0; int tc = t->action >> BLK_TC_SHIFT; @@ -1259,9 +1259,9 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) } static inline -const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +const struct blk_io_trace2 *te_blk_io_trace(const struct trace_entry *ent) { - return (const struct blk_io_trace *)ent; + return (const struct blk_io_trace2 *)ent; } static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) @@ -1320,7 +1320,7 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act, unsigned long long ts = iter->ts; unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); unsigned secs = (unsigned long)ts; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); @@ -1334,7 +1334,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, bool has_cg) { char rwbs[RWBS_LEN]; - const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + const struct blk_io_trace2 *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); if (has_cg) { @@ -1555,7 +1555,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; - const struct blk_io_trace *t; + const struct blk_io_trace2 *t; u16 what; bool long_act; blk_log_action_t *log_action; @@ -1592,8 +1592,8 @@ static enum print_line_t 
blk_trace_event_print(struct trace_iterator *iter, static void blk_trace_synthesize_old_trace(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; - struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; - const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace2 *t = (struct blk_io_trace2 *)iter->ent; + const int offset = offsetof(struct blk_io_trace2, sector); struct blk_io_trace old = { .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, .time = iter->ts, -- 2.51.0 Add block trace commands for zone operations. Signed-off-by: Johannes Thumshirn --- include/uapi/linux/blktrace_api.h | 13 +++++++++++-- kernel/trace/blktrace.c | 25 +++++++++++++++++++++---- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 3a771b9802aa..925f78af939e 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -26,11 +26,20 @@ enum blktrace_cat { BLK_TC_DRV_DATA = 1 << 14, /* binary per-driver data */ BLK_TC_FUA = 1 << 15, /* fua requests */ - BLK_TC_END = 1 << 15, /* we've run out of bits! */ + BLK_TC_END_V1 = 1 << 15, /* we've run out of bits! 
*/ + + BLK_TC_ZONE_APPEND = 1ull << 16, /* zone append */ + BLK_TC_ZONE_RESET = 1ull << 17, /* zone reset */ + BLK_TC_ZONE_RESET_ALL = 1ull << 18, /* zone reset all */ + BLK_TC_ZONE_FINISH = 1ull << 19, /* zone finish */ + BLK_TC_ZONE_OPEN = 1ull << 20, /* zone open */ + BLK_TC_ZONE_CLOSE = 1ull << 21, /* zone close */ + + BLK_TC_END_V2 = 1ull << 21, }; #define BLK_TC_SHIFT (16) -#define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT) +#define BLK_TC_ACT(act) ((u64)(act) << BLK_TC_SHIFT) /* * Basic trace actions diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8ffb218e9fb7..90b225c5bad7 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -163,8 +163,8 @@ static void relay_blktrace_event(struct blk_trace *bt, unsigned long sequence, bytes, what, error, cgid, cgid_len, pdu_data, pdu_len); return relay_blktrace_event1(bt, sequence, pid, cpu, sector, bytes, - lower_32_bits(what), error, cgid, cgid_len, - pdu_data, pdu_len); + what, error, cgid, cgid_len, pdu_data, + pdu_len); } /* @@ -342,6 +342,24 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, case REQ_OP_FLUSH: what |= BLK_TC_ACT(BLK_TC_FLUSH); break; + case REQ_OP_ZONE_APPEND: + what |= BLK_TC_ACT(BLK_TC_ZONE_APPEND); + break; + case REQ_OP_ZONE_RESET: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET); + break; + case REQ_OP_ZONE_RESET_ALL: + what |= BLK_TC_ACT(BLK_TC_ZONE_RESET_ALL); + break; + case REQ_OP_ZONE_FINISH: + what |= BLK_TC_ACT(BLK_TC_ZONE_FINISH); + break; + case REQ_OP_ZONE_OPEN: + what |= BLK_TC_ACT(BLK_TC_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + what |= BLK_TC_ACT(BLK_TC_ZONE_CLOSE); + break; default: break; } @@ -386,8 +404,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, sequence = per_cpu_ptr(bt->sequence, cpu); (*sequence)++; relay_blktrace_event(bt, *sequence, pid, cpu, sector, bytes, - lower_32_bits(what), error, cgid, cgid_len, - pdu_data, pdu_len); + what, error, cgid, cgid_len, pdu_data, 
pdu_len); local_irq_restore(flags); } -- 2.51.0 Expose ZONE APPEND completions as a block trace completion action to blktrace. As tracing of zoned block commands needs the upper 32 bits of the widened 64-bit action, only add traces to blktrace if user-space has requested version 2 of the blktrace protocol. Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn --- include/uapi/linux/blktrace_api.h | 3 +++ kernel/trace/blktrace.c | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 925f78af939e..bf41e34df3c8 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -97,6 +97,9 @@ enum blktrace_notify { #define BLK_TA_ABORT (__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE)) #define BLK_TA_DRV_DATA (__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA)) +#define BLK_TA_ZONE_APPEND (__BLK_TA_COMPLETE |\ + BLK_TC_ACT(BLK_TC_ZONE_APPEND)) + #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_MESSAGE (__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY)) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 90b225c5bad7..e8d562ad21f9 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -974,6 +974,22 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, blk_trace_request_get_cgid(rq)); } +static void blk_add_trace_zone_update_request(void *ignore, struct request *rq) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt) || bt->version < 2) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ZONE_APPEND, + blk_trace_request_get_cgid(rq)); +} + /** * blk_add_trace_bio - Add a trace for a bio oriented action * @q: queue the io is for @@ -1204,6 +1220,9 @@ static void blk_register_tracepoints(void) 
WARN_ON(ret); ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); + ret = register_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); + WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); @@ -1223,6 +1242,8 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_blk_zone_append_update_request_bio( + blk_add_trace_zone_update_request, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); -- 2.51.0 Trace zone write plugging operations on block devices. As tracing of zoned block commands needs the upper 32 bits of the widened 64-bit action, only add traces to blktrace if user-space has requested version 2 of the blktrace protocol. 
Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn --- include/uapi/linux/blktrace_api.h | 5 ++++ kernel/trace/blktrace.c | 39 +++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index bf41e34df3c8..63de5fb07553 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -62,6 +62,8 @@ enum blktrace_act { __BLK_TA_REMAP, /* bio was remapped */ __BLK_TA_ABORT, /* request aborted */ __BLK_TA_DRV_DATA, /* driver-specific binary data */ + __BLK_TA_ZONE_PLUG, /* zone write plug was plugged */ + __BLK_TA_ZONE_UNPLUG, /* zone write plug was unplugged */ __BLK_TA_CGROUP = 1 << 8, /* from a cgroup*/ }; @@ -99,6 +101,9 @@ enum blktrace_notify { #define BLK_TA_ZONE_APPEND (__BLK_TA_COMPLETE |\ BLK_TC_ACT(BLK_TC_ZONE_APPEND)) +#define BLK_TA_ZONE_PLUG (__BLK_TA_ZONE_PLUG | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_ZONE_UNPLUG (__BLK_TA_ZONE_UNPLUG |\ + BLK_TC_ACT(BLK_TC_QUEUE)) #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e8d562ad21f9..d659fed0650d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1080,6 +1080,37 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, rcu_read_unlock(); } +static void blk_add_trace_zone_plug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + + return; +} + +static void blk_add_trace_zone_unplug(void *ignore, struct request_queue *q, + unsigned int zno, sector_t sector, + unsigned int sectors) +{ + struct blk_trace *bt; + + 
rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (bt && bt->version >= 2) + __blk_add_trace(bt, sector, sectors << SECTOR_SHIFT, 0, + BLK_TA_ZONE_UNPLUG, 0, 0, NULL, 0); + rcu_read_unlock(); + return; +} + static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; @@ -1223,6 +1254,12 @@ static void blk_register_tracepoints(void) ret = register_trace_blk_zone_append_update_request_bio( blk_add_trace_zone_update_request, NULL); WARN_ON(ret); + ret = register_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, + NULL); + WARN_ON(ret); + ret = register_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, + NULL); + WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); @@ -1242,6 +1279,8 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_blk_zone_wplug_bio(blk_add_trace_zone_unplug, NULL); + unregister_trace_disk_zone_wplug_add_bio(blk_add_trace_zone_plug, NULL); unregister_trace_blk_zone_append_update_request_bio( blk_add_trace_zone_update_request, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); -- 2.51.0 Handle the BLKTRACESETUP2 ioctl, requesting an extended version of the blktrace protocol from user-space. 
Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn --- block/ioctl.c | 1 + kernel/trace/blktrace.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/block/ioctl.c b/block/ioctl.c index d7489a56b33c..3927ca4707d0 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -691,6 +691,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) /* Incompatible alignment on i386 */ case BLKTRACESETUP: + case BLKTRACESETUP2: return blk_trace_ioctl(bdev, cmd, argp); default: break; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d659fed0650d..0719f9b76082 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -745,6 +745,37 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } EXPORT_SYMBOL_GPL(blk_trace_setup); +static int blk_trace_setup2(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) +{ + struct blk_user_trace_setup2 buts2; + struct blk_trace *bt; + int ret; + + ret = copy_from_user(&buts2, arg, sizeof(buts2)); + if (ret) + return -EFAULT; + + if (!buts2.buf_size || !buts2.buf_nr) + return -EINVAL; + + mutex_lock(&q->debugfs_mutex); + bt = blk_trace_setup_prepare(q, name, dev, buts2.buf_size, buts2.buf_nr, + bdev); + if (IS_ERR(bt)) { + mutex_unlock(&q->debugfs_mutex); + return PTR_ERR(bt); + } + blk_trace_setup_finalize(q, name, 2, bt, &buts2); + mutex_unlock(&q->debugfs_mutex); + + if (copy_to_user(arg, &buts2, sizeof(buts2))) { + blk_trace_remove(q); + return -EFAULT; + } + return 0; +} + #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) static int compat_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, @@ -835,6 +866,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) char b[BDEVNAME_SIZE]; switch (cmd) { + case BLKTRACESETUP2: + snprintf(b, sizeof(b), "%pg", bdev); + ret = blk_trace_setup2(q, b, bdev->bd_dev, bdev, arg); + break; 
case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); -- 2.51.0