From cf910e83ae23692fdeefc7e506e504c4c468d38a Mon Sep 17 00:00:00 2001 From: Seiji Aguchi Date: Thu, 20 Jun 2013 11:46:53 -0400 Subject: x86, trace: Add irq vector tracepoints [Purpose of this patch] As Vaibhav explained in the thread below, tracepoints for irq vectors are useful. http://www.spinics.net/lists/mm-commits/msg85707.html The current interrupt traces from irq_handler_entry and irq_handler_exit show when an interrupt is handled. They provide good data about when the system has switched to kernel space and how it affects the currently running processes. There are some IRQ vectors which trigger the system into kernel space, which are not handled in generic IRQ handlers. Tracing such events gives us the information about IRQ interaction with other system events. The trace also tells where the system is spending its time. We want to know which cores are handling interrupts and how they are affecting other processes in the system. Also, the trace provides information about when the cores are idle and which interrupts are changing that state. On the other hand, my use case is tracing just the local timer event and getting the value of the instruction pointer. I previously suggested adding an argument to the local timer event to get the instruction pointer. But there is another way to get it, with an external module like systemtap. So, I don't need to add any argument to irq vector tracepoints now. [Patch Description] Vaibhav's patch shared a single tracepoint pair, irq_vector_entry/irq_vector_exit, across all events. But, as in the use case above, one may want to trace a specific irq vector rather than all events. In this case, we are concerned about overhead due to unwanted events. So, add the following tracepoints instead of introducing irq_vector_entry/exit, so that we can enable them independently. 
- local_timer_vector - reschedule_vector - call_function_vector - call_function_single_vector - irq_work_entry_vector - error_apic_vector - thermal_apic_vector - threshold_apic_vector - spurious_apic_vector - x86_platform_ipi_vector Also, introduce logic that switches the IDT at enabling/disabling time so that the time penalty is zero when tracepoints are disabled. Detailed explanations are as follows. - Create trace irq handlers with entering_irq()/exiting_irq(). - Create a new IDT, trace_idt_table, at boot time by adding logic to _set_gate(). It is just a copy of the original IDT table. - Register the new handlers for tracepoints to the new IDT by introducing macros to alloc_intr_gate() called at registering time of irq_vector handlers. - Add a check of whether irq vector tracing is on/off to load_current_idt(). This has to be done below the debug check for these reasons. - Switching to debug IDT may be kicked while tracing is enabled. - On the other hand, switching to trace IDT is kicked only when debugging is disabled. In addition, the new IDT is created only when CONFIG_TRACING is enabled to avoid being used for other purposes. Signed-off-by: Seiji Aguchi Link: http://lkml.kernel.org/r/51C323ED.5050708@hds.com Signed-off-by: H. Peter Anvin Cc: Steven Rostedt --- arch/x86/kernel/tracepoint.c | 57 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 arch/x86/kernel/tracepoint.c (limited to 'arch/x86/kernel/tracepoint.c') diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c new file mode 100644 index 00000000000..1423efe98fb --- /dev/null +++ b/arch/x86/kernel/tracepoint.c @@ -0,0 +1,57 @@ +/* + * Code for supporting irq vector tracepoints. 
+ * + * Copyright (C) 2013 Seiji Aguchi + * + */ +#include +#include +#include + +atomic_t trace_idt_ctr = ATOMIC_INIT(0); +struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) trace_idt_table }; + +#ifndef CONFIG_X86_64 +gate_desc trace_idt_table[NR_VECTORS] __page_aligned_data + = { { { { 0, 0 } } }, }; +#endif + +static int trace_irq_vector_refcount; +static DEFINE_MUTEX(irq_vector_mutex); + +static void set_trace_idt_ctr(int val) +{ + atomic_set(&trace_idt_ctr, val); + /* Ensure the trace_idt_ctr is set before sending IPI */ + wmb(); +} + +static void switch_idt(void *arg) +{ + load_current_idt(); +} + +void trace_irq_vector_regfunc(void) +{ + mutex_lock(&irq_vector_mutex); + if (!trace_irq_vector_refcount) { + set_trace_idt_ctr(1); + smp_call_function(switch_idt, NULL, 0); + switch_idt(NULL); + } + trace_irq_vector_refcount++; + mutex_unlock(&irq_vector_mutex); +} + +void trace_irq_vector_unregfunc(void) +{ + mutex_lock(&irq_vector_mutex); + trace_irq_vector_refcount--; + if (!trace_irq_vector_refcount) { + set_trace_idt_ctr(0); + smp_call_function(switch_idt, NULL, 0); + switch_idt(NULL); + } + mutex_unlock(&irq_vector_mutex); +} -- cgit v1.2.3-70-g09d2 From 2b4bc78956bdcc2bb4c49b3af955be776817e897 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Sat, 22 Jun 2013 13:16:19 -0400 Subject: trace,x86: Do not call local_irq_save() in load_current_idt() As load_current_idt() is now what is used to update the IDT for the switches needed for NMI, lockdep debug, and for tracing, it must not call local_irq_save(). This is because one of the users of this is lockdep, which does tracing of local_irq_save() and when the debug trap is hit, we need to update the IDT before tracing interrupts being disabled. As load_current_idt() is used to do this, calling local_irq_save() which lockdep traces, defeats the point of calling load_current_idt(). 
As interrupts are already disabled when used by lockdep and NMI, the only other user is tracing that can disable interrupts itself. Simply have the tracing update disable interrupts before calling load_current_idt() instead of breaking the other users. Here's the dump that happened: ------------[ cut here ]------------ WARNING: at /work/autotest/nobackup/linux-test.git/kernel/fork.c:1196 copy_process+0x2c3/0x1398() DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled) Modules linked in: CPU: 1 PID: 4570 Comm: gdm-simple-gree Not tainted 3.10.0-rc3-test+ #5 Hardware name: /DG965MQ, BIOS MQ96510J.86A.0372.2006.0605.1717 06/05/2006 ffffffff81d2a7a5 ffff88006ed13d50 ffffffff8192822b ffff88006ed13d90 ffffffff81035f25 ffff8800721c6000 ffff88006ed13da0 0000000001200011 0000000000000000 ffff88006ed5e000 ffff8800721c6000 ffff88006ed13df0 Call Trace: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x67/0x80 [] warn_slowpath_fmt+0x46/0x48 [] ? __raw_spin_lock_init+0x31/0x52 [] copy_process+0x2c3/0x1398 [] do_fork+0xa8/0x260 [] ? trace_preempt_on+0x2a/0x2f [] ? trace_hardirqs_on_thunk+0x3a/0x3f [] ? sysret_check+0x1b/0x56 [] ? sysret_check+0x1b/0x56 [] SyS_clone+0x16/0x18 [] stub_clone+0x69/0x90 [] ? 
system_call_fastpath+0x16/0x1b ---[ end trace 8b157a9d20ca1aa2 ]--- in fork.c: #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); <-- bug here DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif Cc: Seiji Aguchi Signed-off-by: Steven Rostedt --- arch/x86/include/asm/desc.h | 10 ++++------ arch/x86/kernel/tracepoint.c | 4 ++++ 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel/tracepoint.c') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 1377ecb29d8..b90e5dfeee4 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -497,21 +497,19 @@ static inline void load_trace_idt(void) #endif /* - * the load_current_idt() is called with interrupt disabled by local_irq_save() + * The load_current_idt() must be called with interrupts disabled * to avoid races. That way the IDT will always be set back to the expected - * descriptor. + * descriptor. It's also called when a CPU is being initialized, and + * that doesn't need to disable interrupts, as nothing should be + * bothering the CPU then. 
*/ static inline void load_current_idt(void) { - unsigned long flags; - - local_irq_save(flags); if (is_debug_idt_enabled()) load_debug_idt(); else if (is_trace_idt_enabled()) load_trace_idt(); else load_idt((const struct desc_ptr *)&idt_descr); - local_irq_restore(flags); } #endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 1423efe98fb..4e584a8d6ed 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -29,7 +29,11 @@ static void set_trace_idt_ctr(int val) static void switch_idt(void *arg) { + unsigned long flags; + + local_irq_save(flags); load_current_idt(); + local_irq_restore(flags); } void trace_irq_vector_regfunc(void) -- cgit v1.2.3-70-g09d2 From 4df05f361937ee86e5a8c9ead8aeb6a19ea9b7d7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 16 Jul 2013 11:34:41 -0700 Subject: x86: Make sure IDT is page aligned Since the IDT is referenced from a fixmap, make sure it is page aligned. Merge with 32-bit one, since it was already aligned to deal with F00F bug. Since bss is cleared before IDT setup, it can live there. This also moves the other *_idt_table variables into common locations. This avoids the risk of the IDT ever being moved in the bss and having the mapping be offset, resulting in calling incorrect handlers. In the current upstream kernel this is not a manifested bug, but heavily patched kernels (such as those using the PaX patch series) did encounter this bug. The tables other than idt_table technically do not need to be page aligned, at least not at the current time, but using a common declaration avoids mistakes. On 64 bits the table is exactly one page long, anyway. Signed-off-by: Kees Cook Link: http://lkml.kernel.org/r/20130716183441.GA14232@www.outflux.net Reported-by: PaX Team Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/head_64.S | 15 --------------- arch/x86/kernel/tracepoint.c | 6 ++---- arch/x86/kernel/traps.c | 12 ++++++------ 3 files changed, 8 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel/tracepoint.c') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 5e4d8a8a5c4..e1aabdb314c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -512,21 +512,6 @@ ENTRY(phys_base) #include "../../x86/xen/xen-head.S" - .section .bss, "aw", @nobits - .align L1_CACHE_BYTES -ENTRY(idt_table) - .skip IDT_ENTRIES * 16 - - .align L1_CACHE_BYTES -ENTRY(debug_idt_table) - .skip IDT_ENTRIES * 16 - -#ifdef CONFIG_TRACING - .align L1_CACHE_BYTES -ENTRY(trace_idt_table) - .skip IDT_ENTRIES * 16 -#endif - __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 4e584a8d6ed..1c113db9ed5 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -12,10 +12,8 @@ atomic_t trace_idt_ctr = ATOMIC_INIT(0); struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) trace_idt_table }; -#ifndef CONFIG_X86_64 -gate_desc trace_idt_table[NR_VECTORS] __page_aligned_data - = { { { { 0, 0 } } }, }; -#endif +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; static int trace_irq_vector_refcount; static DEFINE_MUTEX(irq_vector_mutex); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b0865e88d3c..1b23a1c9274 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -63,19 +63,19 @@ #include #include #include + +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; #else #include #include asmlinkage int system_call(void); - -/* - * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround. 
- */ -gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; #endif +/* Must be page-aligned because the real IDT is used in a fixmap. */ +gate_desc idt_table[NR_VECTORS] __page_aligned_bss; + DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); -- cgit v1.2.3-70-g09d2