From 72c9ddfd4c5bf54ef03cfdf57026416cb678eeba Mon Sep 17 00:00:00 2001
From: David Miller
Date: Tue, 20 Apr 2010 15:47:11 -0700
Subject: ring-buffer: Make non-consuming read less expensive with lots of cpus.

When performing a non-consuming read, a synchronize_sched() is
performed once for every cpu which is actively tracing.

This is very expensive, and can make it take several seconds to open
up the 'trace' file with lots of cpus.

Only one synchronize_sched() call is actually necessary.  What is
desired is for all cpus to see the disabling state change.  So we
transform the existing sequence:

	for_each_cpu() {
		ring_buffer_read_start();
	}

where each ring_buffer_read_start() call performs a
synchronize_sched(), into the following:

	for_each_cpu() {
		ring_buffer_read_prepare();
	}

	ring_buffer_read_prepare_sync();

	for_each_cpu() {
		ring_buffer_read_start();
	}

wherein only the single ring_buffer_read_prepare_sync() call needs to
do the synchronize_sched().

The first phase, via ring_buffer_read_prepare(), allocates the 'iter'
memory and increments ->record_disabled.

In the second phase, ring_buffer_read_prepare_sync() makes sure this
->record_disabled state is fully visible to all cpus.

And in the final third phase, the ring_buffer_read_start() calls reset
the 'iter' objects allocated in the first phase, since we now know
that none of the cpus are adding trace entries any more.

This makes opening the 'trace' file nearly instantaneous on a sparc64
Niagara2 box with 128 cpus tracing.

Signed-off-by: David S. Miller
LKML-Reference: <20100420.154711.11246950.davem@davemloft.net>
Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 64 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 54 insertions(+), 10 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5885cdfc41f..2a090448ef6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3332,23 +3332,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
 
 /**
- * ring_buffer_read_start - start a non consuming read of the buffer
+ * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
  * @buffer: The ring buffer to read from
  * @cpu: The cpu buffer to iterate over
  *
- * This starts up an iteration through the buffer. It also disables
- * the recording to the buffer until the reading is finished.
- * This prevents the reading from being corrupted. This is not
- * a consuming read, so a producer is not expected.
+ * This performs the initial preparations necessary to iterate
+ * through the buffer. Memory is allocated, buffer recording
+ * is disabled, and the iterator pointer is returned to the caller.
  *
- * Must be paired with ring_buffer_finish.
+ * Disabling buffer recording prevents the reading from being
+ * corrupted. This is not a consuming read, so a producer is not
+ * expected.
+ *
+ * After a sequence of ring_buffer_read_prepare calls, the user is
+ * expected to make at least one call to ring_buffer_read_prepare_sync.
+ * Afterwards, ring_buffer_read_start is invoked to get things going
+ * for real.
+ *
+ * This overall must be paired with ring_buffer_finish.
  */
 struct ring_buffer_iter *
-ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_iter *iter;
-	unsigned long flags;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return NULL;
@@ -3362,15 +3369,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 	iter->cpu_buffer = cpu_buffer;
 
 	atomic_inc(&cpu_buffer->record_disabled);
+
+	return iter;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
+
+/**
+ * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
+ *
+ * All previously invoked ring_buffer_read_prepare calls to prepare
+ * iterators will be synchronized. Afterwards, ring_buffer_read_start
+ * calls on those iterators are allowed.
+ */
+void
+ring_buffer_read_prepare_sync(void)
+{
 	synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @iter: The iterator returned by ring_buffer_read_prepare
+ *
+ * This finalizes the startup of an iteration through the buffer.
+ * The iterator comes from a call to ring_buffer_read_prepare and
+ * an intervening ring_buffer_read_prepare_sync must have been
+ * performed.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+void
+ring_buffer_read_start(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags;
+
+	if (!iter)
+		return;
+
+	cpu_buffer = iter->cpu_buffer;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	arch_spin_lock(&cpu_buffer->lock);
 	rb_iter_reset(iter);
 	arch_spin_unlock(&cpu_buffer->lock);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
-
-	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
--
cgit v1.2.3-70-g09d2
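To make the three-phase sequence concrete, here is a rough caller-side sketch of the new API. It is an illustration only, not code from the patch: open_trace_iterators and the iters array are invented names, kernel context (NR_CPUS, for_each_online_cpu) is assumed, and the real user, the 'trace' file open path in kernel/trace/trace.c, also handles locking and error paths.

/* Hypothetical caller of the prepare/sync/start API (illustration
 * only; assumes kernel context for NR_CPUS/for_each_online_cpu). */
static struct ring_buffer_iter *iters[NR_CPUS];

static void open_trace_iterators(struct ring_buffer *buffer)
{
	int cpu;

	/* Phase 1: allocate each iterator and bump record_disabled. */
	for_each_online_cpu(cpu)
		iters[cpu] = ring_buffer_read_prepare(buffer, cpu);

	/* Phase 2: one synchronize_sched() covers every cpu, instead
	 * of one call per cpu as before. */
	ring_buffer_read_prepare_sync();

	/* Phase 3: no cpu can still be writing, so the iterators can
	 * be reset (ring_buffer_read_start() skips a NULL iter). */
	for_each_online_cpu(cpu)
		ring_buffer_read_start(iters[cpu]);
}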
From 956097912c40a03bf22603a3be73503fd9ea9e44 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sun, 2 May 2010 08:03:54 +0200
Subject: ring-buffer: Wrap open-coded WARN_ONCE

Wrap open-coded WARN_ONCE functionality into the equivalent macro.

Signed-off-by: Borislav Petkov
LKML-Reference: <20100502060354.GA5281@liondog.tnic>
Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2a090448ef6..7f6059c5aa9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2000,17 +2000,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  u64 *ts, u64 *delta)
 {
 	struct ring_buffer_event *event;
-	static int once;
 	int ret;
 
-	if (unlikely(*delta > (1ULL << 59) && !once++)) {
-		printk(KERN_WARNING "Delta way too big! %llu"
-		       " ts=%llu write stamp = %llu\n",
-		       (unsigned long long)*delta,
-		       (unsigned long long)*ts,
-		       (unsigned long long)cpu_buffer->write_stamp);
-		WARN_ON(1);
-	}
+	WARN_ONCE(*delta > (1ULL << 59),
+		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+		  (unsigned long long)*delta,
+		  (unsigned long long)*ts,
+		  (unsigned long long)cpu_buffer->write_stamp);
 
 	/*
 	 * The delta is too big, we to add a
--
cgit v1.2.3-70-g09d2
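For reference, the macro behaves like the block it replaces. A simplified sketch of the expansion (paraphrased, not the exact definition from include/asm-generic/bug.h) looks like this:

/* Simplified sketch of WARN_ONCE(condition, format...); kernel
 * context (unlikely(), WARN()) is assumed. */
#define SKETCH_WARN_ONCE(condition, format...) ({		\
	static bool __warned;					\
	int __ret_warn_once = !!(condition);			\
								\
	if (unlikely(__ret_warn_once && !__warned)) {		\
		__warned = true;				\
		/* WARN() prints the message and a backtrace */	\
		WARN(1, format);				\
	}							\
	unlikely(__ret_warn_once);				\
})

The static flag takes over the job of the removed 'static int once', and WARN() prints the format string together with the backtrace that previously took a separate printk() plus WARN_ON(1).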
From b3230c8b44da5838cf396942d5c1ab19f8e8f720 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 21 May 2010 11:55:21 -0400
Subject: ring-buffer: Reset "real_end" when page is filled

The code to store the "lost events" requires knowing the real end of
the page. Since the 'commit' includes the padding at the end of a
page, a "real_end" variable was used to keep track of the end not
including the padding.

If events were lost, the reader can place the count of events in the
padded area if there is enough room.

The bug this patch fixes is that when we fill the page we do not
reset the real_end variable, so if the writer had wrapped a few
times, the real_end would be incorrect.

This patch simply resets the real_end if the page was filled.

Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7f6059c5aa9..b0702ff7821 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1768,6 +1768,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	 * must fill the old tail_page with padding.
 	 */
 	if (tail >= BUF_PAGE_SIZE) {
+		/*
+		 * If the page was filled, then we still need
+		 * to update the real_end. Reset it to zero
+		 * and the reader will ignore it.
+		 */
+		if (tail == BUF_PAGE_SIZE)
+			tail_page->real_end = 0;
+
 		local_sub(length, &tail_page->write);
 		return;
 	}
--
cgit v1.2.3-70-g09d2
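To spell out the contract the reset restores (the helper below is hypothetical, not kernel code): a real_end of zero tells the reader there is no recorded data end to honor, so a value left over from an earlier trip around the ring must not survive an exactly-filled page.

/* Hypothetical reader-side helper (not in the kernel) showing how
 * real_end is meant to be interpreted. */
static unsigned int page_data_end(unsigned int commit,
				  unsigned int real_end)
{
	/* When real_end is set, data stops there and the padded tail
	 * may hold a missed-events count; when it is 0, the reader
	 * ignores it and trusts the commit offset. */
	return real_end ? real_end : commit;
}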
From 2711ca237a084286ea1c2dcf82ab2aadab23a00d Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 21 May 2010 13:32:26 -0400
Subject: ring-buffer: Move zeroing out excess in page to ring buffer code

Currently the trace splice code zeros out the excess bytes in the page
before sending it off to userspace.

This is to make sure userspace is not getting anything it should not
be when reading the pages, because the excess data was never
initialized to zero before writing (for performance reasons).

But the splice code has no business doing this work; it should be done
by the ring buffer. With the latest changes for recording lost events,
the splice code gets it wrong anyway.

Move the zeroing out of excess bytes into the ring buffer code.

Signed-off-by: Steven Rostedt
---
 kernel/trace/ring_buffer.c | 11 +++++++++--
 kernel/trace/trace.c       |  6 ------
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel/trace/ring_buffer.c')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b0702ff7821..1da7b6ea8b8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3902,12 +3902,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	ret = read;
 
 	cpu_buffer->lost_events = 0;
+
+	commit = local_read(&bpage->commit);
 	/*
 	 * Set a flag in the commit field if we lost events
 	 */
 	if (missed_events) {
-		commit = local_read(&bpage->commit);
-
 		/* If there is room at the end of the page to save the
 		 * missed events, then record it there.
 		 */
@@ -3915,10 +3915,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 			memcpy(&bpage->data[commit], &missed_events,
 			       sizeof(missed_events));
 			local_add(RB_MISSED_STORED, &bpage->commit);
+			commit += sizeof(missed_events);
 		}
 		local_add(RB_MISSED_EVENTS, &bpage->commit);
 	}
 
+	/*
+	 * This page may be off to user land. Zero it out here.
+	 */
+	if (commit < BUF_PAGE_SIZE)
+		memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+
  out_unlock:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ba0ec81158b..95d0b1a28f9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3661,7 +3661,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 		     size_t count, loff_t *ppos)
 {
 	struct ftrace_buffer_info *info = filp->private_data;
-	unsigned int pos;
 	ssize_t ret;
 	size_t size;
 
@@ -3688,11 +3687,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	if (ret < 0)
 		return 0;
 
-	pos = ring_buffer_page_len(info->spare);
-
-	if (pos < PAGE_SIZE)
-		memset(info->spare + pos, 0, PAGE_SIZE - pos);
-
  read:
 	size = PAGE_SIZE - info->read;
 	if (size > count)
--
cgit v1.2.3-70-g09d2
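The effect of the move is easiest to see in isolation. The following standalone C model mirrors the new end-of-page handling in ring_buffer_read_page(); the BUF_PAGE_SIZE value is illustrative, and the RB_MISSED_STORED/RB_MISSED_EVENTS flag updates are deliberately omitted.

#include <string.h>

/* Illustrative only: the real BUF_PAGE_SIZE is PAGE_SIZE minus the
 * buffer page header. */
#define BUF_PAGE_SIZE 4080

/* Userspace model of the zeroing now done by the ring buffer. */
static void finish_read_page(char *data, size_t commit,
			     unsigned long missed_events)
{
	if (missed_events &&
	    commit + sizeof(missed_events) <= BUF_PAGE_SIZE) {
		/* Record the lost-event count in the padded area. */
		memcpy(&data[commit], &missed_events,
		       sizeof(missed_events));
		commit += sizeof(missed_events);
	}

	/* Zero everything past the valid data so no stale bytes are
	 * handed to user land. */
	if (commit < BUF_PAGE_SIZE)
		memset(&data[commit], 0, BUF_PAGE_SIZE - commit);
}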