diff options
-rw-r--r-- | arch/Kconfig | 2 | ||||
-rw-r--r-- | arch/x86/oprofile/op_model_amd.c | 224 | ||||
-rw-r--r-- | drivers/oprofile/buffer_sync.c | 229 | ||||
-rw-r--r-- | drivers/oprofile/cpu_buffer.c | 393 | ||||
-rw-r--r-- | drivers/oprofile/cpu_buffer.h | 72 | ||||
-rw-r--r-- | drivers/oprofile/event_buffer.c | 4 | ||||
-rw-r--r-- | drivers/oprofile/oprof.c | 4 | ||||
-rw-r--r-- | drivers/oprofile/oprof.h | 8 | ||||
-rw-r--r-- | drivers/oprofile/oprofile_files.c | 24 | ||||
-rw-r--r-- | include/linux/oprofile.h | 21 | ||||
-rw-r--r-- | include/linux/ring_buffer.h | 2 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 44 | ||||
-rw-r--r-- | kernel/trace/trace.c | 4 |
13 files changed, 566 insertions, 465 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 471e72dbaf8..2e13aa26192 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -6,6 +6,8 @@ config OPROFILE tristate "OProfile system profiling (EXPERIMENTAL)" depends on PROFILING depends on HAVE_OPROFILE + select TRACING + select RING_BUFFER help OProfile is a profiling system capable of profiling the whole system, include the kernel, kernel modules, libraries, diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 509513760a6..8fdf06e4edf 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -2,7 +2,7 @@ * @file op_model_amd.c * athlon / K7 / K8 / Family 10h model-specific MSR operations * - * @remark Copyright 2002-2008 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon @@ -10,7 +10,7 @@ * @author Graydon Hoare * @author Robert Richter <robert.richter@amd.com> * @author Barry Kasindorf -*/ + */ #include <linux/oprofile.h> #include <linux/device.h> @@ -60,56 +60,10 @@ static unsigned long reset_value[NUM_COUNTERS]; #define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ #define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ -/* Codes used in cpu_buffer.c */ -/* This produces duplicate code, need to be fixed */ -#define IBS_FETCH_BEGIN 3 -#define IBS_OP_BEGIN 4 - -/* The function interface needs to be fixed, something like add - data. Should then be added to linux/oprofile.h. */ -extern void -oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int *const ibs_sample, int ibs_code); - -struct ibs_fetch_sample { - /* MSRC001_1031 IBS Fetch Linear Address Register */ - unsigned int ibs_fetch_lin_addr_low; - unsigned int ibs_fetch_lin_addr_high; - /* MSRC001_1030 IBS Fetch Control Register */ - unsigned int ibs_fetch_ctl_low; - unsigned int ibs_fetch_ctl_high; - /* MSRC001_1032 IBS Fetch Physical Address Register */ - unsigned int ibs_fetch_phys_addr_low; - unsigned int ibs_fetch_phys_addr_high; -}; - -struct ibs_op_sample { - /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */ - unsigned int ibs_op_rip_low; - unsigned int ibs_op_rip_high; - /* MSRC001_1035 IBS Op Data Register */ - unsigned int ibs_op_data1_low; - unsigned int ibs_op_data1_high; - /* MSRC001_1036 IBS Op Data 2 Register */ - unsigned int ibs_op_data2_low; - unsigned int ibs_op_data2_high; - /* MSRC001_1037 IBS Op Data 3 Register */ - unsigned int ibs_op_data3_low; - unsigned int ibs_op_data3_high; - /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */ - unsigned int ibs_dc_linear_low; - unsigned int ibs_dc_linear_high; - /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */ - unsigned int ibs_dc_phys_low; - unsigned int ibs_dc_phys_high; -}; - -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ -*/ -static void clear_ibs_nmi(void); +#define IBS_FETCH_SIZE 6 +#define IBS_OP_SIZE 12 -static int ibs_allowed; /* AMD Family10h and later */ +static int has_ibs; /* AMD Family10h and later */ struct op_ibs_config { unsigned long op_enabled; @@ -200,31 +154,29 @@ static inline int op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - unsigned int low, high; - struct ibs_fetch_sample ibs_fetch; - struct ibs_op_sample ibs_op; + u32 low, high; + u64 msr; + struct op_entry entry; - if (!ibs_allowed) + if (!has_ibs) return 1; if (ibs_config.fetch_enabled) { rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); if (high & IBS_FETCH_HIGH_VALID_BIT) { - ibs_fetch.ibs_fetch_ctl_high = high; - ibs_fetch.ibs_fetch_ctl_low = low; - rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); - ibs_fetch.ibs_fetch_lin_addr_high = high; - ibs_fetch.ibs_fetch_lin_addr_low = low; - rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); - ibs_fetch.ibs_fetch_phys_addr_high = high; - ibs_fetch.ibs_fetch_phys_addr_low = low; - - oprofile_add_ibs_sample(regs, - (unsigned int *)&ibs_fetch, - IBS_FETCH_BEGIN); - - /*reenable the IRQ */ - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); + oprofile_write_reserve(&entry, regs, msr, + IBS_FETCH_CODE, IBS_FETCH_SIZE); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, low); + oprofile_add_data(&entry, high); + rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_write_commit(&entry); + + /* reenable the IRQ */ high &= ~IBS_FETCH_HIGH_VALID_BIT; high |= IBS_FETCH_HIGH_ENABLE; low &= IBS_FETCH_LOW_MAX_CNT_MASK; @@ -235,30 +187,29 @@ op_amd_handle_ibs(struct pt_regs * const regs, if (ibs_config.op_enabled) { rdmsr(MSR_AMD64_IBSOPCTL, low, high); if (low & IBS_OP_LOW_VALID_BIT) { - rdmsr(MSR_AMD64_IBSOPRIP, low, high); - ibs_op.ibs_op_rip_low = low; - ibs_op.ibs_op_rip_high = high; - rdmsr(MSR_AMD64_IBSOPDATA, low, high); - ibs_op.ibs_op_data1_low = low; - ibs_op.ibs_op_data1_high = high; - rdmsr(MSR_AMD64_IBSOPDATA2, low, high); - ibs_op.ibs_op_data2_low = low; - ibs_op.ibs_op_data2_high = high; - rdmsr(MSR_AMD64_IBSOPDATA3, low, high); - ibs_op.ibs_op_data3_low = low; - ibs_op.ibs_op_data3_high = high; - rdmsr(MSR_AMD64_IBSDCLINAD, low, high); - ibs_op.ibs_dc_linear_low = low; - ibs_op.ibs_dc_linear_high = high; - rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); - ibs_op.ibs_dc_phys_low = low; - ibs_op.ibs_dc_phys_high = high; + rdmsrl(MSR_AMD64_IBSOPRIP, msr); + oprofile_write_reserve(&entry, regs, msr, + IBS_OP_CODE, IBS_OP_SIZE); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA2, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA3, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSDCLINAD, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_write_commit(&entry); /* reenable the IRQ */ - oprofile_add_ibs_sample(regs, - (unsigned int *)&ibs_op, - IBS_OP_BEGIN); - rdmsr(MSR_AMD64_IBSOPCTL, low, high); high = 0; low &= ~IBS_OP_LOW_VALID_BIT; low |= IBS_OP_LOW_ENABLE; @@ -308,14 +259,14 @@ static void op_amd_start(struct op_msrs const * const msrs) } #ifdef CONFIG_OPROFILE_IBS - if (ibs_allowed && ibs_config.fetch_enabled) { + if (has_ibs && ibs_config.fetch_enabled) { low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ + IBS_FETCH_HIGH_ENABLE; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } - if (ibs_allowed && ibs_config.op_enabled) { + if (has_ibs && ibs_config.op_enabled) { low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ + IBS_OP_LOW_ENABLE; @@ -331,8 +282,10 @@ static void op_amd_stop(struct op_msrs const * const msrs) unsigned int low, high; int i; - /* Subtle: stop on all counters to avoid race with - * setting our pm callback */ + /* + * Subtle: stop on all counters to avoid race with setting our + * pm callback + */ for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (!reset_value[i]) continue; @@ -342,14 +295,16 @@ static void op_amd_stop(struct op_msrs const * const msrs) } #ifdef CONFIG_OPROFILE_IBS - if (ibs_allowed && ibs_config.fetch_enabled) { - low = 0; /* clear max count and enable */ + if (has_ibs && ibs_config.fetch_enabled) { + /* clear max count and enable */ + low = 0; high = 0; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } - if (ibs_allowed && ibs_config.op_enabled) { - low = 0; /* clear max count and enable */ + if (has_ibs && ibs_config.op_enabled) { + /* clear max count and enable */ + low = 0; high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } @@ -370,18 +325,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } -#ifndef CONFIG_OPROFILE_IBS - -/* no IBS support */ - -static int op_amd_init(struct oprofile_operations *ops) -{ - return 0; -} - -static void op_amd_exit(void) {} - -#else +#ifdef CONFIG_OPROFILE_IBS static u8 ibs_eilvt_off; @@ -395,7 +339,7 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg) setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); } -static int pfm_amd64_setup_eilvt(void) +static int init_ibs_nmi(void) { #define IBSCTL_LVTOFFSETVAL (1 << 8) #define IBSCTL 0x1cc @@ -419,6 +363,7 @@ static int pfm_amd64_setup_eilvt(void) | IBSCTL_LVTOFFSETVAL); pci_read_config_dword(cpu_cfg, IBSCTL, &value); if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + pci_dev_put(cpu_cfg); printk(KERN_DEBUG "Failed to setup IBS LVT offset, " "IBSCTL = 0x%08x", value); return 1; @@ -443,33 +388,35 @@ static int pfm_amd64_setup_eilvt(void) return 0; } -/* - * initialize the APIC for the IBS interrupts - * if available (AMD Family10h rev B0 and later) - */ -static void setup_ibs(void) +/* uninitialize the APIC for the IBS interrupts if needed */ +static void clear_ibs_nmi(void) { - ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); + if (has_ibs) + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); +} + +/* initialize the APIC for the IBS interrupts if available */ +static void ibs_init(void) +{ + has_ibs = boot_cpu_has(X86_FEATURE_IBS); - if (!ibs_allowed) + if (!has_ibs) return; - if (pfm_amd64_setup_eilvt()) { - ibs_allowed = 0; + if (init_ibs_nmi()) { + has_ibs = 0; return; } printk(KERN_INFO "oprofile: AMD IBS detected\n"); } - -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h - * rev B0 and later */ -static void clear_ibs_nmi(void) +static void ibs_exit(void) { - if (ibs_allowed) - on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); + if (!has_ibs) + return; + + clear_ibs_nmi(); } static int (*create_arch_files)(struct super_block *sb, struct dentry *root); @@ -486,7 +433,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) if (ret) return ret; - if (!ibs_allowed) + if (!has_ibs) return ret; /* model specific files */ @@ -519,7 +466,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) static int op_amd_init(struct oprofile_operations *ops) { - setup_ibs(); + ibs_init(); create_arch_files = ops->create_files; ops->create_files = setup_ibs_files; return 0; @@ -527,10 +474,21 @@ static int op_amd_init(struct oprofile_operations *ops) static void op_amd_exit(void) { - clear_ibs_nmi(); + ibs_exit(); } -#endif +#else + +/* no IBS support */ + +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) {} + +#endif /* CONFIG_OPROFILE_IBS */ struct op_x86_model_spec const op_amd_spec = { .init = op_amd_init, diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index b55cd23ffde..ac014cb2791 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -1,11 +1,12 @@ /** * @file buffer_sync.c * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon <levon@movementarian.org> * @author Barry Kasindorf + * @author Robert Richter <robert.richter@amd.com> * * This is the core of the buffer management. Each * CPU buffer is processed and entered into the @@ -268,18 +269,6 @@ lookup_dcookie(struct mm_struct *mm, unsigned long addr, off_t *offset) return cookie; } -static void increment_tail(struct oprofile_cpu_buffer *b) -{ - unsigned long new_tail = b->tail_pos + 1; - - rmb(); /* be sure fifo pointers are synchromized */ - - if (new_tail < b->buffer_size) - b->tail_pos = new_tail; - else - b->tail_pos = 0; -} - static unsigned long last_cookie = INVALID_COOKIE; static void add_cpu_switch(int i) @@ -327,84 +316,73 @@ static void add_trace_begin(void) add_event_entry(TRACE_BEGIN_CODE); } -#ifdef CONFIG_OPROFILE_IBS - -#define IBS_FETCH_CODE_SIZE 2 -#define IBS_OP_CODE_SIZE 5 -#define IBS_EIP(offset) \ - (((struct op_sample *)&cpu_buf->buffer[(offset)])->eip) -#define IBS_EVENT(offset) \ - (((struct op_sample *)&cpu_buf->buffer[(offset)])->event) - -/* - * Add IBS fetch and op entries to event buffer - */ -static void add_ibs_begin(struct oprofile_cpu_buffer *cpu_buf, int code, - struct mm_struct *mm) +static void add_data(struct op_entry *entry, struct mm_struct *mm) { - unsigned long rip; - int i, count; - unsigned long ibs_cookie = 0; + unsigned long code, pc, val; + unsigned long cookie; off_t offset; - increment_tail(cpu_buf); /* move to RIP entry */ - - rip = IBS_EIP(cpu_buf->tail_pos); - -#ifdef __LP64__ - rip += IBS_EVENT(cpu_buf->tail_pos) << 32; -#endif + if (!op_cpu_buffer_get_data(entry, &code)) + return; + if (!op_cpu_buffer_get_data(entry, &pc)) + return; + if (!op_cpu_buffer_get_size(entry)) + return; if (mm) { - ibs_cookie = lookup_dcookie(mm, rip, &offset); + cookie = lookup_dcookie(mm, pc, &offset); - if (ibs_cookie == NO_COOKIE) - offset = rip; - if (ibs_cookie == INVALID_COOKIE) { + if (cookie == NO_COOKIE) + offset = pc; + if (cookie == INVALID_COOKIE) { atomic_inc(&oprofile_stats.sample_lost_no_mapping); - offset = rip; + offset = pc; } - if (ibs_cookie != last_cookie) { - add_cookie_switch(ibs_cookie); - last_cookie = ibs_cookie; + if (cookie != last_cookie) { + add_cookie_switch(cookie); + last_cookie = cookie; } } else - offset = rip; + offset = pc; add_event_entry(ESCAPE_CODE); add_event_entry(code); add_event_entry(offset); /* Offset from Dcookie */ - /* we send the Dcookie offset, but send the raw Linear Add also*/ - add_event_entry(IBS_EIP(cpu_buf->tail_pos)); - add_event_entry(IBS_EVENT(cpu_buf->tail_pos)); - - if (code == IBS_FETCH_CODE) - count = IBS_FETCH_CODE_SIZE; /*IBS FETCH is 2 int64s*/ - else - count = IBS_OP_CODE_SIZE; /*IBS OP is 5 int64s*/ - - for (i = 0; i < count; i++) { - increment_tail(cpu_buf); - add_event_entry(IBS_EIP(cpu_buf->tail_pos)); - add_event_entry(IBS_EVENT(cpu_buf->tail_pos)); - } + while (op_cpu_buffer_get_data(entry, &val)) + add_event_entry(val); } -#endif - -static void add_sample_entry(unsigned long offset, unsigned long event) +static inline void add_sample_entry(unsigned long offset, unsigned long event) { add_event_entry(offset); add_event_entry(event); } -static int add_us_sample(struct mm_struct *mm, struct op_sample *s) +/* + * Add a sample to the global event buffer. If possible the + * sample is converted into a persistent dentry/offset pair + * for later lookup from userspace. Return 0 on failure. + */ +static int +add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel) { unsigned long cookie; off_t offset; + if (in_kernel) { + add_sample_entry(s->eip, s->event); + return 1; + } + + /* add userspace sample */ + + if (!mm) { + atomic_inc(&oprofile_stats.sample_lost_no_mm); + return 0; + } + cookie = lookup_dcookie(mm, s->eip, &offset); if (cookie == INVALID_COOKIE) { @@ -423,25 +401,6 @@ static int add_us_sample(struct mm_struct *mm, struct op_sample *s) } -/* Add a sample to the global event buffer. If possible the - * sample is converted into a persistent dentry/offset pair - * for later lookup from userspace. - */ -static int -add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel) -{ - if (in_kernel) { - add_sample_entry(s->eip, s->event); - return 1; - } else if (mm) { - return add_us_sample(mm, s); - } else { - atomic_inc(&oprofile_stats.sample_lost_no_mm); - } - return 0; -} - - static void release_mm(struct mm_struct *mm) { if (!mm) @@ -466,33 +425,6 @@ static inline int is_code(unsigned long val) } -/* "acquire" as many cpu buffer slots as we can */ -static unsigned long get_slots(struct oprofile_cpu_buffer *b) -{ - unsigned long head = b->head_pos; - unsigned long tail = b->tail_pos; - - /* - * Subtle. This resets the persistent last_task - * and in_kernel values used for switching notes. - * BUT, there is a small window between reading - * head_pos, and this call, that means samples - * can appear at the new head position, but not - * be prefixed with the notes for switching - * kernel mode or a task switch. This small hole - * can lead to mis-attribution or samples where - * we don't know if it's in the kernel or not, - * at the start of an event buffer. - */ - cpu_buffer_reset(b); - - if (head >= tail) - return head - tail; - - return head + (b->buffer_size - tail); -} - - /* Move tasks along towards death. Any tasks on dead_tasks * will definitely have no remaining references in any * CPU buffers at this point, because we use two lists, @@ -559,71 +491,72 @@ typedef enum { */ void sync_buffer(int cpu) { - struct oprofile_cpu_buffer *cpu_buf = &per_cpu(cpu_buffer, cpu); struct mm_struct *mm = NULL; + struct mm_struct *oldmm; + unsigned long val; struct task_struct *new; unsigned long cookie = 0; int in_kernel = 1; sync_buffer_state state = sb_buffer_start; -#ifndef CONFIG_OPROFILE_IBS unsigned int i; unsigned long available; -#endif + unsigned long flags; + struct op_entry entry; + struct op_sample *sample; mutex_lock(&buffer_mutex); add_cpu_switch(cpu); - /* Remember, only we can modify tail_pos */ - -#ifndef CONFIG_OPROFILE_IBS - available = get_slots(cpu_buf); + op_cpu_buffer_reset(cpu); + available = op_cpu_buffer_entries(cpu); for (i = 0; i < available; ++i) { -#else - while (get_slots(cpu_buf)) { -#endif - struct op_sample *s = &cpu_buf->buffer[cpu_buf->tail_pos]; + sample = op_cpu_buffer_read_entry(&entry, cpu); + if (!sample) + break; - if (is_code(s->eip)) { - if (s->event <= CPU_IS_KERNEL) { + if (is_code(sample->eip)) { + flags = sample->event; + if (flags & TRACE_BEGIN) { + state = sb_bt_start; + add_trace_begin(); + } + if (flags & KERNEL_CTX_SWITCH) { /* kernel/userspace switch */ - in_kernel = s->event; + in_kernel = flags & IS_KERNEL; if (state == sb_buffer_start) state = sb_sample_start; - add_kernel_ctx_switch(s->event); - } else if (s->event == CPU_TRACE_BEGIN) { - state = sb_bt_start; - add_trace_begin(); -#ifdef CONFIG_OPROFILE_IBS - } else if (s->event == IBS_FETCH_BEGIN) { - state = sb_bt_start; - add_ibs_begin(cpu_buf, IBS_FETCH_CODE, mm); - } else if (s->event == IBS_OP_BEGIN) { - state = sb_bt_start; - add_ibs_begin(cpu_buf, IBS_OP_CODE, mm); -#endif - } else { - struct mm_struct *oldmm = mm; - + add_kernel_ctx_switch(flags & IS_KERNEL); + } + if (flags & USER_CTX_SWITCH + && op_cpu_buffer_get_data(&entry, &val)) { /* userspace context switch */ - new = (struct task_struct *)s->event; - + new = (struct task_struct *)val; + oldmm = mm; release_mm(oldmm); mm = take_tasks_mm(new); if (mm != oldmm) cookie = get_exec_dcookie(mm); add_user_ctx_switch(new, cookie); } - } else if (state >= sb_bt_start && - !add_sample(mm, s, in_kernel)) { - if (state == sb_bt_start) { - state = sb_bt_ignore; - atomic_inc(&oprofile_stats.bt_lost_no_mapping); - } + if (op_cpu_buffer_get_size(&entry)) + add_data(&entry, mm); + continue; } - increment_tail(cpu_buf); + if (state < sb_bt_start) + /* ignore sample */ + continue; + + if (add_sample(mm, sample, in_kernel)) + continue; + + /* ignore backtraces if failed to add a sample */ + if (state == sb_bt_start) { + state = sb_bt_ignore; + atomic_inc(&oprofile_stats.bt_lost_no_mapping); + } } release_mm(mm); diff --git a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 01d38e78cde..2e03b6d796d 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -1,11 +1,12 @@ /** * @file cpu_buffer.c * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon <levon@movementarian.org> * @author Barry Kasindorf <barry.kasindorf@amd.com> + * @author Robert Richter <robert.richter@amd.com> * * Each CPU has a local buffer that stores PC value/event * pairs. We also log context switches when we notice them. @@ -28,6 +29,25 @@ #include "buffer_sync.h" #include "oprof.h" +#define OP_BUFFER_FLAGS 0 + +/* + * Read and write access is using spin locking. Thus, writing to the + * buffer by NMI handler (x86) could occur also during critical + * sections when reading the buffer. To avoid this, there are 2 + * buffers for independent read and write access. Read access is in + * process context only, write access only in the NMI handler. If the + * read buffer runs empty, both buffers are swapped atomically. There + * is potentially a small window during swapping where the buffers are + * disabled and samples could be lost. + * + * Using 2 buffers is a little bit overhead, but the solution is clear + * and does not require changes in the ring buffer implementation. It + * can be changed to a single buffer solution when the ring buffer + * access is implemented as non-locking atomic code. + */ +static struct ring_buffer *op_ring_buffer_read; +static struct ring_buffer *op_ring_buffer_write; DEFINE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer); static void wq_sync_buffer(struct work_struct *work); @@ -35,19 +55,9 @@ static void wq_sync_buffer(struct work_struct *work); #define DEFAULT_TIMER_EXPIRE (HZ / 10) static int work_enabled; -void free_cpu_buffers(void) -{ - int i; - - for_each_possible_cpu(i) { - vfree(per_cpu(cpu_buffer, i).buffer); - per_cpu(cpu_buffer, i).buffer = NULL; - } -} - unsigned long oprofile_get_cpu_buffer_size(void) { - return fs_cpu_buffer_size; + return oprofile_cpu_buffer_size; } void oprofile_cpu_buffer_inc_smpl_lost(void) @@ -58,26 +68,36 @@ void oprofile_cpu_buffer_inc_smpl_lost(void) cpu_buf->sample_lost_overflow++; } +void free_cpu_buffers(void) +{ + if (op_ring_buffer_read) + ring_buffer_free(op_ring_buffer_read); + op_ring_buffer_read = NULL; + if (op_ring_buffer_write) + ring_buffer_free(op_ring_buffer_write); + op_ring_buffer_write = NULL; +} + int alloc_cpu_buffers(void) { int i; - unsigned long buffer_size = fs_cpu_buffer_size; + unsigned long buffer_size = oprofile_cpu_buffer_size; + + op_ring_buffer_read = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS); + if (!op_ring_buffer_read) + goto fail; + op_ring_buffer_write = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS); + if (!op_ring_buffer_write) + goto fail; for_each_possible_cpu(i) { struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i); - b->buffer = vmalloc_node(sizeof(struct op_sample) * buffer_size, - cpu_to_node(i)); - if (!b->buffer) - goto fail; - b->last_task = NULL; b->last_is_kernel = -1; b->tracing = 0; b->buffer_size = buffer_size; - b->tail_pos = 0; - b->head_pos = 0; b->sample_received = 0; b->sample_lost_overflow = 0; b->backtrace_aborted = 0; @@ -124,73 +144,156 @@ void end_cpu_work(void) flush_scheduled_work(); } -/* Resets the cpu buffer to a sane state. */ -void cpu_buffer_reset(struct oprofile_cpu_buffer *cpu_buf) +/* + * This function prepares the cpu buffer to write a sample. + * + * Struct op_entry is used during operations on the ring buffer while + * struct op_sample contains the data that is stored in the ring + * buffer. Struct entry can be uninitialized. The function reserves a + * data array that is specified by size. Use + * op_cpu_buffer_write_commit() after preparing the sample. In case of + * errors a null pointer is returned, otherwise the pointer to the + * sample. + * + */ +struct op_sample +*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size) { - /* reset these to invalid values; the next sample - * collected will populate the buffer with proper - * values to initialize the buffer - */ - cpu_buf->last_is_kernel = -1; - cpu_buf->last_task = NULL; + entry->event = ring_buffer_lock_reserve + (op_ring_buffer_write, sizeof(struct op_sample) + + size * sizeof(entry->sample->data[0]), &entry->irq_flags); + if (entry->event) + entry->sample = ring_buffer_event_data(entry->event); + else + entry->sample = NULL; + + if (!entry->sample) + return NULL; + + entry->size = size; + entry->data = entry->sample->data; + + return entry->sample; } -/* compute number of available slots in cpu_buffer queue */ -static unsigned long nr_available_slots(struct oprofile_cpu_buffer const *b) +int op_cpu_buffer_write_commit(struct op_entry *entry) { - unsigned long head = b->head_pos; - unsigned long tail = b->tail_pos; + return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event, + entry->irq_flags); +} - if (tail > head) - return (tail - head) - 1; +struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu) +{ + struct ring_buffer_event *e; + e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); + if (e) + goto event; + if (ring_buffer_swap_cpu(op_ring_buffer_read, + op_ring_buffer_write, + cpu)) + return NULL; + e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); + if (e) + goto event; + return NULL; + +event: + entry->event = e; + entry->sample = ring_buffer_event_data(e); + entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample)) + / sizeof(entry->sample->data[0]); + entry->data = entry->sample->data; + return entry->sample; +} - return tail + (b->buffer_size - head) - 1; +unsigned long op_cpu_buffer_entries(int cpu) +{ + return ring_buffer_entries_cpu(op_ring_buffer_read, cpu) + + ring_buffer_entries_cpu(op_ring_buffer_write, cpu); } -static void increment_head(struct oprofile_cpu_buffer *b) +static int +op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace, + int is_kernel, struct task_struct *task) { - unsigned long new_head = b->head_pos + 1; + struct op_entry entry; + struct op_sample *sample; + unsigned long flags; + int size; + + flags = 0; - /* Ensure anything written to the slot before we - * increment is visible */ - wmb(); + if (backtrace) + flags |= TRACE_BEGIN; + + /* notice a switch from user->kernel or vice versa */ + is_kernel = !!is_kernel; + if (cpu_buf->last_is_kernel != is_kernel) { + cpu_buf->last_is_kernel = is_kernel; + flags |= KERNEL_CTX_SWITCH; + if (is_kernel) + flags |= IS_KERNEL; + } + + /* notice a task switch */ + if (cpu_buf->last_task != task) { + cpu_buf->last_task = task; + flags |= USER_CTX_SWITCH; + } + + if (!flags) + /* nothing to do */ + return 0; - if (new_head < b->buffer_size) - b->head_pos = new_head; + if (flags & USER_CTX_SWITCH) + size = 1; else - b->head_pos = 0; -} + size = 0; -static inline void -add_sample(struct oprofile_cpu_buffer *cpu_buf, - unsigned long pc, unsigned long event) -{ - struct op_sample *entry = &cpu_buf->buffer[cpu_buf->head_pos]; - entry->eip = pc; - entry->event = event; - increment_head(cpu_buf); + sample = op_cpu_buffer_write_reserve(&entry, size); + if (!sample) + return -ENOMEM; + + sample->eip = ESCAPE_CODE; + sample->event = flags; + + if (size) + op_cpu_buffer_add_data(&entry, (unsigned long)task); + + op_cpu_buffer_write_commit(&entry); + + return 0; } -static inline void -add_code(struct oprofile_cpu_buffer *buffer, unsigned long value) +static inline int +op_add_sample(struct oprofile_cpu_buffer *cpu_buf, + unsigned long pc, unsigned long event) { - add_sample(buffer, ESCAPE_CODE, value); + struct op_entry entry; + struct op_sample *sample; + + sample = op_cpu_buffer_write_reserve(&entry, 0); + if (!sample) + return -ENOMEM; + + sample->eip = pc; + sample->event = event; + + return op_cpu_buffer_write_commit(&entry); } -/* This must be safe from any context. It's safe writing here - * because of the head/tail separation of the writer and reader - * of the CPU buffer. +/* + * This must be safe from any context. * * is_kernel is needed because on some architectures you cannot * tell if you are in kernel or user space simply by looking at * pc. We tag this in the buffer by generating kernel enter/exit * events whenever is_kernel changes */ -static int log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, - int is_kernel, unsigned long event) +static int +log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, + unsigned long backtrace, int is_kernel, unsigned long event) { - struct task_struct *task; - cpu_buf->sample_received++; if (pc == ESCAPE_CODE) { @@ -198,131 +301,115 @@ static int log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, return 0; } - if (nr_available_slots(cpu_buf) < 3) { - cpu_buf->sample_lost_overflow++; - return 0; - } - - is_kernel = !!is_kernel; + if (op_add_code(cpu_buf, backtrace, is_kernel, current)) + goto fail; - task = current; + if (op_add_sample(cpu_buf, pc, event)) + goto fail; - /* notice a switch from user->kernel or vice versa */ - if (cpu_buf->last_is_kernel != is_kernel) { - cpu_buf->last_is_kernel = is_kernel; - add_code(cpu_buf, is_kernel); - } - - /* notice a task switch */ - if (cpu_buf->last_task != task) { - cpu_buf->last_task = task; - add_code(cpu_buf, (unsigned long)task); - } - - add_sample(cpu_buf, pc, event); return 1; + +fail: + cpu_buf->sample_lost_overflow++; + return 0; } -static int oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf) +static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf) { - if (nr_available_slots(cpu_buf) < 4) { - cpu_buf->sample_lost_overflow++; - return 0; - } - - add_code(cpu_buf, CPU_TRACE_BEGIN); cpu_buf->tracing = 1; - return 1; } -static void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf) +static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf) { cpu_buf->tracing = 0; } -void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs, - unsigned long event, int is_kernel) |