Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce.c	792
1 file changed, 542 insertions(+), 250 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e9..9a79c8dbd8e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -7,6 +7,9 @@
  * Copyright 2008 Intel Corporation
  * Author: Andi Kleen
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/thread_info.h>
 #include <linux/capability.h>
 #include <linux/miscdevice.h>
@@ -55,35 +58,24 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
 
-int mce_disabled __read_mostly;
-
-#define MISC_MCELOG_MINOR	227
-
 #define SPINUNIT 100	/* 100ns */
 
-atomic_t mce_entry;
-
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
-/*
- * Tolerant levels:
- * 0: always panic on uncorrected errors, log corrected errors
- * 1: panic or SIGBUS on uncorrected errors, log corrected errors
- * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
- * 3: never panic or SIGBUS, log all errors (for testing only)
- */
-static int tolerant		__read_mostly = 1;
-static int banks		__read_mostly;
-static int rip_msr		__read_mostly;
-static int mce_bootlog		__read_mostly = -1;
-static int monarch_timeout	__read_mostly = -1;
-static int mce_panic_timeout	__read_mostly;
-static int mce_dont_log_ce	__read_mostly;
-int mce_cmci_disabled		__read_mostly;
-int mce_ignore_ce		__read_mostly;
-int mce_ser			__read_mostly;
-
-struct mce_bank *mce_banks	__read_mostly;
+struct mce_bank *mce_banks __read_mostly;
+
+struct mca_config mca_cfg __read_mostly = {
+	.bootlog  = -1,
+	/*
+	 * Tolerant levels:
+	 * 0: always panic on uncorrected errors, log corrected errors
+	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
+	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
+	 * 3: never panic or SIGBUS, log all errors (for testing only)
+	 */
+	.tolerant = 1,
+	.monarch_timeout = -1
+};
 
 /* User mode helper program triggered by machine check event */
 static unsigned long mce_need_notify;
@@ -95,13 +87,30 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int cpu_missing;
 
-/* MCA banks polled by the period polling timer for corrected events */
+/* CMCI storm detection filter */
+static DEFINE_PER_CPU(unsigned long, mce_polled_error);
+
+/*
+ * MCA banks polled by the period polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
+ */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 };
 
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
 static DEFINE_PER_CPU(struct work_struct, mce_work);
 
+static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+
 /*
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
 */
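The scattered `static int` knobs removed above are folded into a single `struct mca_config` whose non-zero defaults come from designated initializers; every other field starts life as zero/false. A minimal, self-contained sketch of the same pattern (field set abbreviated and hypothetical, not the real layout from `<asm/mce.h>`):

    #include <stdbool.h>
    #include <stdint.h>

    struct mca_config_sketch {
        bool disabled, dont_log_ce, cmci_disabled, ignore_ce, ser;
        int tolerant;          /* 0..3, see the comment in the diff */
        int monarch_timeout;   /* usecs; < 0 means "decide later" */
        int panic_timeout;
        int bootlog;           /* -1 lets vendor quirks decide */
        uint8_t banks;
    };

    /* Fields without a designated initializer are guaranteed zero. */
    static struct mca_config_sketch cfg = {
        .bootlog         = -1,
        .tolerant        = 1,
        .monarch_timeout = -1,
    };

Boot-option parsing and sysfs writes then all funnel into one object (`cfg.tolerant = ...`) instead of a loose set of globals.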
@@ -191,7 +200,7 @@ static void drain_mcelog_buffer(void)
 {
 	unsigned int next, i, prev = 0;
 
-	next = rcu_dereference_check_mce(mcelog.next);
+	next = ACCESS_ONCE(mcelog.next);
 
 	do {
 		struct mce *m;
@@ -210,7 +219,7 @@ static void drain_mcelog_buffer(void)
 			cpu_relax();
 
 			if (!m->finished && retries >= 4) {
-				pr_err("MCE: skipping error being logged currently!\n");
+				pr_err("skipping error being logged currently!\n");
 				break;
 			}
 		}
@@ -298,7 +307,7 @@ static void wait_for_panic(void)
 	while (timeout-- > 0)
 		udelay(1);
 	if (panic_timeout == 0)
-		panic_timeout = mce_panic_timeout;
+		panic_timeout = mca_cfg.panic_timeout;
 	panic("Panicing machine check CPU died");
 }
 
@@ -356,7 +365,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 		pr_emerg(HW_ERR "Machine check: %s\n", exp);
 	if (!fake_panic) {
 		if (panic_timeout == 0)
-			panic_timeout = mce_panic_timeout;
+			panic_timeout = mca_cfg.panic_timeout;
 		panic(msg);
 	} else
 		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -368,7 +377,7 @@ static int msr_to_offset(u32 msr)
 {
 	unsigned bank = __this_cpu_read(injectm.bank);
 
-	if (msr == rip_msr)
+	if (msr == mca_cfg.rip_msr)
 		return offsetof(struct mce, ip);
 	if (msr == MSR_IA32_MCx_STATUS(bank))
 		return offsetof(struct mce, status);
@@ -437,10 +446,18 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 	if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
 		m->ip = regs->ip;
 		m->cs = regs->cs;
+
+		/*
+		 * When in VM86 mode make the cs look like ring 3
+		 * always. This is a lie, but it's better than passing
+		 * the additional vm86 bit around everywhere.
+		 */
+		if (v8086_mode(regs))
+			m->cs |= 3;
 	}
 	/* Use accurate RIP reporting if available. */
-	if (rip_msr)
-		m->ip = mce_rdmsrl(rip_msr);
+	if (mca_cfg.rip_msr)
+		m->ip = mce_rdmsrl(mca_cfg.rip_msr);
 	}
 }
@@ -501,18 +518,15 @@ static int mce_ring_add(unsigned long pfn)
 
 int mce_available(struct cpuinfo_x86 *c)
 {
-	if (mce_disabled)
+	if (mca_cfg.disabled)
 		return 0;
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
 static void mce_schedule_work(void)
 {
-	if (!mce_ring_empty()) {
-		struct work_struct *work = &__get_cpu_var(mce_work);
-		if (!work_pending(work))
-			schedule_work(work);
-	}
+	if (!mce_ring_empty())
+		schedule_work(&__get_cpu_var(mce_work));
 }
 
 DEFINE_PER_CPU(struct irq_work, mce_irq_work);
@@ -540,6 +554,27 @@ static void mce_report_event(struct pt_regs *regs)
 	irq_work_queue(&__get_cpu_var(mce_irq_work));
 }
 
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+	if (m->status & MCI_STATUS_MISCV)
+		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+	if (m->status & MCI_STATUS_ADDRV) {
+		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+		/*
+		 * Mask the reported address by the reported granularity.
+		 */
+		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
+			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+			m->addr >>= shift;
+			m->addr <<= shift;
+		}
+	}
+}
+
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
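`mce_read_aux()` centralizes the MISC/ADDR reads that were previously duplicated in the poller and the exception handler, and on SER-capable parts masks the address down to the granularity the bank reports in MCi_MISC. The masking arithmetic on its own, with made-up values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t addr = 0x12345678;  /* hypothetical reported address */
        uint8_t lsb = 12;            /* MCI_MISC_ADDR_LSB: 4K granularity */

        /* Clear the bits below the reported granularity. */
        addr >>= lsb;
        addr <<= lsb;

        printf("%#llx\n", (unsigned long long)addr);  /* prints 0x12345000 */
        return 0;
    }
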
@@ -562,11 +597,11 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	struct mce m;
 	int i;
 
-	percpu_inc(mce_poll_count);
+	this_cpu_inc(mce_poll_count);
 
 	mce_gather_info(&m, NULL);
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		if (!mce_banks[i].ctl || !test_bit(i, *b))
 			continue;
@@ -580,6 +615,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		if (!(m.status & MCI_STATUS_VAL))
 			continue;
 
+		this_cpu_write(mce_polled_error, 1);
 		/*
 		 * Uncorrected or signalled events are handled by the exception
 		 * handler when it is enabled, so don't process those here.
@@ -587,13 +623,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 * TBD do the same check for MCI_STATUS_EN here?
 		 */
 		if (!(flags & MCP_UC) &&
-		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
+		    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
 			continue;
 
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		if (!(flags & MCP_TIMESTAMP))
 			m.tsc = 0;
@@ -601,7 +634,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 * Don't get the IP here because it's unlikely to
 		 * have anything to do with the actual error location.
 		 */
-		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
+		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
 			mce_log(&m);
 
 		/*
@@ -623,16 +656,22 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
  * Do a quick check if any of the events requires a panic.
  * This decides if we keep the events around or clear them.
 */
-static int mce_no_way_out(struct mce *m, char **msg)
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+			  struct pt_regs *regs)
 {
-	int i;
+	int i, ret = 0;
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
-		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
-			return 1;
+		if (m->status & MCI_STATUS_VAL) {
+			__set_bit(i, validp);
+			if (quirk_no_way_out)
+				quirk_no_way_out(i, m, regs);
+		}
+		if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
+			ret = 1;
 	}
-	return 0;
+	return ret;
 }
 
 /*
@@ -660,11 +699,10 @@ static int mce_timed_out(u64 *t)
 	rmb();
 	if (atomic_read(&mce_paniced))
 		wait_for_panic();
-	if (!monarch_timeout)
+	if (!mca_cfg.monarch_timeout)
 		goto out;
 	if ((s64)*t < SPINUNIT) {
-		/* CHECKME: Make panic default for 1 too? */
-		if (tolerant < 1)
+		if (mca_cfg.tolerant <= 1)
 			mce_panic("Timeout synchronizing machine check over CPUs",
 				  NULL, NULL);
 		cpu_missing = 1;
@@ -714,7 +752,8 @@ static void mce_reign(void)
 	 * Grade the severity of the errors of all the CPUs.
 	 */
 	for_each_possible_cpu(cpu) {
-		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+		int severity = mce_severity(&per_cpu(mces_seen, cpu),
+					    mca_cfg.tolerant,
 					    &nmsg);
 		if (severity > global_worst) {
 			msg = nmsg;
@@ -728,7 +767,7 @@ static void mce_reign(void)
 	 * This dumps all the mces in the log buffer and stops the
 	 * other CPUs.
 	 */
-	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
 		mce_panic("Fatal Machine check", m, msg);
 
 	/*
@@ -741,7 +780,7 @@ static void mce_reign(void)
 	 * No machine check event found. Must be some external
	 * source or one CPU is hung. Panic.
 	 */
-	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
+	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
		mce_panic("Machine check from unknown source", NULL, NULL);
 
 	/*
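The `mce_timed_out()` change above is small but behavioral: a rendezvous timeout now panics at `tolerant <= 1` instead of only at level 0. The surrounding loop shape is a budgeted spin: each caller burns SPINUNIT (100 ns) per iteration and checks whether the budget in `*t` is exhausted. A self-contained sketch of that shape (the `ndelay()` stub stands in for the kernel's nanosecond delay):

    #include <stdbool.h>
    #include <stdint.h>

    #define SPINUNIT 100    /* ns, as in the diff */

    static void ndelay(unsigned long ns) { (void)ns; /* busy-wait stub */ }

    /* Returns true once the budget is gone; otherwise burns one unit. */
    static bool spin_timed_out(int64_t *t)
    {
        if (*t < SPINUNIT)
            return true;    /* caller panics or limps on, per tolerant */
        *t -= SPINUNIT;
        ndelay(SPINUNIT);
        return false;
    }
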
@@ -765,7 +804,7 @@ static int mce_start(int *no_way_out)
 {
 	int order;
 	int cpus = num_online_cpus();
-	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 
 	if (!timeout)
 		return -1;
@@ -829,7 +868,7 @@ static int mce_start(int *no_way_out)
 static int mce_end(int order)
 {
 	int ret = -1;
-	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
 
 	if (!timeout)
 		goto reset;
@@ -910,13 +949,58 @@ static void mce_clear_state(unsigned long *toclear)
 {
 	int i;
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		if (test_bit(i, toclear))
 			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
 	}
 }
 
 /*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define MCE_INFO_MAX 16
+
+struct mce_info {
+	atomic_t		inuse;
+	struct task_struct	*t;
+	__u64			paddr;
+	int			restartable;
+} mce_info[MCE_INFO_MAX];
+
+static void mce_save_info(__u64 addr, int c)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+		if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+			mi->t = current;
+			mi->paddr = addr;
+			mi->restartable = c;
+			return;
+		}
+	}
+
+	mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+
+static struct mce_info *mce_find_info(void)
+{
+	struct mce_info *mi;
+
+	for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+		if (atomic_read(&mi->inuse) && mi->t == current)
+			return mi;
+	return NULL;
+}
+
+static void mce_clear_info(struct mce_info *mi)
+{
+	atomic_set(&mi->inuse, 0);
+}
+
+/*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
  *
@@ -930,6 +1014,7 @@ static void mce_clear_state(unsigned long *toclear)
 */
 void do_machine_check(struct pt_regs *regs, long error_code)
 {
+	struct mca_config *cfg = &mca_cfg;
 	struct mce m, *final;
 	int i;
 	int worst = 0;
@@ -941,7 +1026,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	int order;
 	/*
 	 * If no_way_out gets set, there is no safe way to recover from this
-	 * MCE. If tolerant is cranked up, we'll try anyway.
+	 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
 	 */
 	int no_way_out = 0;
 	/*
@@ -950,13 +1035,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 */
 	int kill_it = 0;
 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 	char *msg = "Unknown";
 
-	atomic_inc(&mce_entry);
-
-	percpu_inc(mce_exception_count);
+	this_cpu_inc(mce_exception_count);
 
-	if (!banks)
+	if (!cfg->banks)
 		goto out;
 
 	mce_gather_info(&m, regs);
@@ -964,12 +1048,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	final = &__get_cpu_var(mces_seen);
 	*final = m;
 
-	no_way_out = mce_no_way_out(&m, &msg);
+	memset(valid_banks, 0, sizeof(valid_banks));
+	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
 
 	barrier();
 
 	/*
-	 * When no restart IP must always kill or panic.
+	 * When no restart IP might need to kill or panic.
+	 * Assume the worst for now, but if we find the
+	 * severity is MCE_AR_SEVERITY we have other options.
 	 */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
		kill_it = 1;
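The new `mce_info[]` table hands the faulting physical address from the #MC handler to `mce_notify_process()` without taking locks: a slot is claimed by flipping its `inuse` flag with `atomic_cmpxchg()` and matched back by `current`. The same claim/release pattern in user-space C11 atomics, illustrative names only:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stddef.h>

    #define POOL_MAX 16

    struct slot {
        atomic_int inuse;
        const void *owner;   /* stands in for 'current' */
        uint64_t paddr;
    };

    static struct slot pool[POOL_MAX];

    static struct slot *slot_claim(const void *owner, uint64_t paddr)
    {
        for (struct slot *s = pool; s < &pool[POOL_MAX]; s++) {
            int expected = 0;
            /* First thread to flip 0 -> 1 owns the slot. */
            if (atomic_compare_exchange_strong(&s->inuse, &expected, 1)) {
                s->owner = owner;
                s->paddr = paddr;
                return s;
            }
        }
        return NULL;   /* the kernel panics here: too many in flight */
    }

    static void slot_release(struct slot *s)
    {
        atomic_store(&s->inuse, 0);
    }
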
@@ -980,8 +1067,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 * because the first one to see it will clear it.
 	 */
 	order = mce_start(&no_way_out);
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < cfg->banks; i++) {
 		__clear_bit(i, toclear);
+		if (!test_bit(i, valid_banks))
+			continue;
 		if (!mce_banks[i].ctl)
 			continue;
@@ -997,16 +1086,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		 * Non uncorrected or non signaled errors are handled by
 		 * machine_check_poll. Leave them alone, unless this panics.
 		 */
-		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
 			!no_way_out)
 			continue;
 
 		/*
 		 * Set taint even when machine check was not enabled.
 		 */
-		add_taint(TAINT_MACHINE_CHECK);
+		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-		severity = mce_severity(&m, tolerant, NULL);
+		severity = mce_severity(&m, cfg->tolerant, NULL);
 
 		/*
 		 * When machine check was for corrected handler don't touch,
@@ -1023,23 +1112,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 			continue;
 		}
 
-		/*
-		 * Kill on action required.
-		 */
-		if (severity == MCE_AR_SEVERITY)
-			kill_it = 1;
-
-		if (m.status & MCI_STATUS_MISCV)
-			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
-		if (m.status & MCI_STATUS_ADDRV)
-			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+		mce_read_aux(&m, i);
 
 		/*
 		 * Action optional error. Queue address for later processing.
 		 * When the ring overflows we just ignore the AO error.
 		 * RED-PEN add some logging mechanism when
 		 * usable_address or mce_add_ring fails.
-		 * RED-PEN don't ignore overflow for tolerant == 0
+		 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
 		 */
 		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
 			mce_ring_add(m.addr >> PAGE_SHIFT);
@@ -1052,6 +1132,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		}
 	}
 
+	/* mce_clear_state will clear *final, save locally for use later */
+	m = *final;
+
 	if (!no_way_out)
		mce_clear_state(toclear);
 
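Worth noting for the block above: `do_machine_check()` executes in an exception context where sleeping is forbidden, which is why an action-required fault is only recorded (`mce_save_info()` plus `TIF_MCE_NOTIFY`) and the actual recovery runs later in process context. A generic sketch of that record-now/act-later split (illustrative, not kernel code; the kernel uses a slot pool instead of a single variable):

    #include <stdatomic.h>
    #include <stdint.h>

    static atomic_bool work_flagged;
    static uint64_t saved_paddr;    /* one slot; the kernel keeps a pool */

    /* "Hard" context: may not sleep. Record and flag only. */
    static void hard_handler(uint64_t paddr)
    {
        saved_paddr = paddr;
        atomic_store(&work_flagged, true);
    }

    /* Safe context (e.g. just before returning to user space): may sleep. */
    static void soft_handler(void)
    {
        if (!atomic_exchange(&work_flagged, false))
            return;
        /* ...slow recovery work using saved_paddr goes here... */
    }
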
@@ -1063,65 +1146,91 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		no_way_out = worst >= MCE_PANIC_SEVERITY;
 
 	/*
-	 * If we have decided that we just CAN'T continue, and the user
-	 * has not set tolerant to an insane level, give up and die.
-	 *
-	 * This is mainly used in the case when the system doesn't
-	 * support MCE broadcasting or it has been disabled.
+	 * At insane "tolerant" levels we take no action. Otherwise
+	 * we only die if we have no other choice. For less serious
+	 * issues we try to recover, or limit damage to the current
+	 * process.
 	 */
-	if (no_way_out && tolerant < 3)
-		mce_panic("Fatal machine check on current CPU", final, msg);
-
-	/*
-	 * If the error seems to be unrecoverable, something should be
-	 * done. Try to kill as little as possible. If we can kill just
-	 * one task, do that. If the user has set the tolerance very
-	 * high, don't try to do anything at all.
-	 */
-
-	if (kill_it && tolerant < 3)
-		force_sig(SIGBUS, current);
-
-	/* notify userspace ASAP */
-	set_thread_flag(TIF_MCE_NOTIFY);
+	if (cfg->tolerant < 3) {
+		if (no_way_out)
+			mce_panic("Fatal machine check on current CPU", &m, msg);
+		if (worst == MCE_AR_SEVERITY) {
+			/* schedule action before return to userland */
+			mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
+			set_thread_flag(TIF_MCE_NOTIFY);
+		} else if (kill_it) {
+			force_sig(SIGBUS, current);
+		}
+	}
 
 	if (worst > 0)
 		mce_report_event(regs);
 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out:
-	atomic_dec(&mce_entry);
 	sync_core();
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int vector, int flags)
 {
-	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+	/* mce_severity() should not hand us an ACTION_REQUIRED error */
+	BUG_ON(flags & MF_ACTION_REQUIRED);
+	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+	       pfn);
+
+	return 0;
 }
+#endif
 
 /*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in process context that interrupted by MCE and marked with
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
 */
 void mce_notify_process(void)
 {
 	unsigned long pfn;
-	mce_notify_irq();
-	while (mce_ring_get(&pfn))
-		memory_failure(pfn, MCE_VECTOR);
+	struct mce_info *mi = mce_find_info();
+	int flags = MF_ACTION_REQUIRED;
+
+	if (!mi)
+		mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+	pfn = mi->paddr >> PAGE_SHIFT;
+
+	clear_thread_flag(TIF_MCE_NOTIFY);
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx",
+		 mi->paddr);
+	/*
+	 * We must call memory_failure() here even if the current process is
+	 * doomed. We still need to mark the page as poisoned and alert any
+	 * other users of the page.
+	 */
+	if (!mi->restartable)
+		flags |= MF_MUST_KILL;
+	if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
+		pr_err("Memory error not recovered");
+		force_sig(SIGBUS, current);
+	}
+	mce_clear_info(mi);
 }
 
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
 static void mce_process_work(struct work_struct *dummy)
 {
-	mce_notify_process();
+	unsigned long pfn;
+
+	while (mce_ring_get(&pfn))
+		memory_failure(pfn, MCE_VECTOR, 0);
 }
 
 #ifdef CONFIG_X86_MCE_INTEL
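From user space, the end result of this recovery path is a SIGBUS whose `si_code` distinguishes action-required from action-optional poisoning, with `si_addr_lsb` giving the blast radius. A sketch of a handler that inspects it (glibc constants `BUS_MCEERR_AR`/`BUS_MCEERR_AO`; `fprintf` from a signal handler is for illustration only, real handlers should stick to async-signal-safe calls):

    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    static void mce_sigbus(int sig, siginfo_t *si, void *uc)
    {
        const char *kind =
            si->si_code == BUS_MCEERR_AR ? "action-required" :
            si->si_code == BUS_MCEERR_AO ? "action-optional" : "other";

        /* si_addr_lsb: how much around si_addr is gone (e.g. 12 = 4K) */
        fprintf(stderr, "SIGBUS (%s) at %p, lsb=%d\n",
                kind, si->si_addr, (int)si->si_addr_lsb);
        _exit(1);   /* a real handler might remap and recover instead */
    }

    int main(void)
    {
        struct sigaction sa = { .sa_sigaction = mce_sigbus,
                                .sa_flags = SA_SIGINFO };
        sigaction(SIGBUS, &sa, NULL);
        pause();
    }
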
@@ -1154,35 +1263,79 @@ void mce_log_therm_throt_event(__u64 status)
  * poller finds an MCE, poll 2x faster. When the poller finds no more
  * errors, poll 2x slower (up to check_interval seconds).
 */
-static int check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = 5 * 60; /* 5 minutes */
 
-static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mce_start_timer(unsigned long data)
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+	return interval;
+}
+
+static unsigned long (*mce_adjust_timer)(unsigned long interval) =
+	mce_adjust_timer_default;
+
+static int cmc_error_seen(void)
+{
+	unsigned long *v = &__get_cpu_var(mce_polled_error);
+
+	return test_and_clear_bit(0, v);
+}
+
+static void mce_timer_fn(unsigned long data)
 {
-	struct timer_list *t = &per_cpu(mce_timer, data);
-	int *n;
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+	unsigned long iv;
+	int notify;
 
 	WARN_ON(smp_processor_id() != data);
 
 	if (mce_available(__this_cpu_ptr(&cpu_info))) {
 		machine_check_poll(MCP_TIMESTAMP,
 				&__get_cpu_var(mce_poll_banks));
+		mce_intel_cmci_poll();
 	}
 
 	/*
 	 * Alert userspace if needed. If we logged an MCE, reduce the
 	 * polling interval, otherwise increase the polling interval.
 	 */
-	n = &__get_cpu_var(mce_next_interval);
-	if (mce_notify_irq())
-		*n = max(*n/2, HZ/100);
-	else
-		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+	iv = __this_cpu_read(mce_next_interval);
+	notify = mce_notify_irq();
+	notify |= cmc_error_seen();
+	if (notify) {
+		iv = max(iv / 2, (unsigned long) HZ/100);
+	} else {
+		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+		iv = mce_adjust_timer(iv);
+	}
+	__this_cpu_write(mce_next_interval, iv);
+	/* Might have become 0 after CMCI storm subsided */
+	if (iv) {
+		t->expires = jiffies + iv;
+		add_timer_on(t, smp_processor_id());
+	}
+}
 
-	t->expires = jiffies + *n;
-	add_timer_on(t, smp_processor_id());
+/*
+ * Ensure that the timer is firing in @interval from now.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+	unsigned long when = jiffies + interval;
+	unsigned long iv = __this_cpu_read(mce_next_interval);
+
+	if (timer_pending(t)) {
+		if (time_before(when, t->expires))
+			mod_timer_pinned(t, when);
+	} else {
+		t->expires = round_jiffies(when);
+		add_timer_on(t, smp_processor_id());
+	}
+	if (interval < iv)
+		__this_cpu_write(mce_next_interval, interval);
 }
 
 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
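`mce_timer_fn()` keeps the old exponential backoff (halve the interval after an event, double it when quiet, capped at `check_interval`), but routes the quiet side through the `mce_adjust_timer` hook so the Intel CMCI-storm code can override it, and tolerates an interval of 0 ("stop polling") while a storm is being handled. The backoff arithmetic in isolation, with HZ hard-coded for the sketch:

    #include <stdbool.h>

    #define HZ              1000UL
    #define CHECK_JIFFIES   (5 * 60 * HZ)    /* check_interval * HZ */

    /* Next polling interval in jiffies, given the current one. */
    static unsigned long next_interval(unsigned long iv, bool saw_event)
    {
        if (saw_event)
            return iv / 2 > HZ / 100 ? iv / 2 : HZ / 100;  /* 10ms floor */
        return iv * 2 < CHECK_JIFFIES ? iv * 2 : CHECK_JIFFIES;
    }
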
@@ -1211,18 +1364,11 @@ int mce_notify_irq(void)
 	/* Not more than two messages every minute */
 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
 
-	clear_thread_flag(TIF_MCE_NOTIFY);
-
 	if (test_and_clear_bit(0, &mce_need_notify)) {
 		/* wake processes polling /dev/mcelog */
 		wake_up_interruptible(&mce_chrdev_wait);
 
-		/*
-		 * There is no risk of missing notifications because
-		 * work_pending is always cleared before the function is
-		 * executed.
-		 */
-		if (mce_helper[0] && !work_pending(&mce_trigger_work))
+		if (mce_helper[0])
 			schedule_work(&mce_trigger_work);
 
 		if (__ratelimit(&ratelimit))
@@ -1234,14 +1380,16 @@ int mce_notify_irq(void)
 }
 EXPORT_SYMBOL_GPL(mce_notify_irq);
 
-static int __cpuinit __mcheck_cpu_mce_banks_init(void)
+static int __mcheck_cpu_mce_banks_init(void)
 {
 	int i;
+	u8 num_banks = mca_cfg.banks;
 
-	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
+	mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
 	if (!mce_banks)
 		return -ENOMEM;
-	for (i = 0; i < banks; i++) {
+
+	for (i = 0; i < num_banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		b->ctl = -1ULL;
@@ -1253,7 +1401,7 @@ static int __cpuinit __mcheck_cpu_mce_banks_init(void)
 /*
  * Initialize Machine Checks for a CPU.
 */
-static int __cpuinit __mcheck_cpu_cap_init(void)
+static int __mcheck_cpu_cap_init(void)
 {
 	unsigned b;
 	u64 cap;
@@ -1261,19 +1409,19 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 
 	b = cap & MCG_BANKCNT_MASK;
-	if (!banks)
-		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+	if (!mca_cfg.banks)
+		pr_info("CPU supports %d MCE banks\n", b);
 
 	if (b > MAX_NR_BANKS) {
-		printk(KERN_WARNING
-		       "MCE: Using only %u machine check banks out of %u\n",
+		pr_warn("Using only %u machine check banks out of %u\n",
 			MAX_NR_BANKS, b);
 		b = MAX_NR_BANKS;
 	}
 
 	/* Don't support asymmetric configurations today */
-	WARN_ON(banks != 0 && b != banks);
-	banks = b;
+	WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
+	mca_cfg.banks = b;
+
 	if (!mce_banks) {
 		int err = __mcheck_cpu_mce_banks_init();
 
@@ -1283,25 +1431,29 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
 
 	/* Use accurate RIP reporting if available. */
 	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
-		rip_msr = MSR_IA32_MCG_EIP;
+		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
 
 	if (cap & MCG_SER_P)
-		mce_ser = 1;
+		mca_cfg.ser = true;
 
 	return 0;
 }
 
 static void __mcheck_cpu_init_generic(void)
 {
+	enum mcp_flags m_fl = 0;
 	mce_banks_t all_banks;
 	u64 cap;
 	int i;
 
+	if (!mca_cfg.bootlog)
+		m_fl = MCP_DONTLOG;
+
 	/*
 	 * Log the machine checks left over from the previous reset.
 	 */
 	bitmap_fill(all_banks, MAX_NR_BANKS);
-	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
+	machine_check_poll(MCP_UC | m_fl, &all_banks);
 
 	set_in_cr4(X86_CR4_MCE);
 
@@ -1309,7 +1461,7 @@ static void __mcheck_cpu_init_generic(void)
 	if (cap & MCG_CTL_P)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
		if (!b->init)
@@ -1319,17 +1471,47 @@ static void __mcheck_cpu_init_generic(void)
 	}
 }
 
+/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+	if (bank != 0)
+		return;
+	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+		return;
+	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+			  MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+			  MCACOD)) !=
+			 (MCI_STATUS_UC|MCI_STATUS_EN|
+			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+			  MCI_STATUS_AR|MCACOD_INSTR))
+		return;
+
+	m->mcgstatus |= MCG_STATUS_EIPV;
+	m->ip = regs->ip;
+	m->cs = regs->cs;
+}
+
 /* Add per CPU specific workarounds here */
-static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 {
+	struct mca_config *cfg = &mca_cfg;
+
 	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
-		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
+		pr_info("unknown CPU type - not enabling MCE support\n");
 		return -EOPNOTSUPP;
 	}
 
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if (c->x86 == 15 && banks > 4) {
+		if (c->x86 == 15 && cfg->banks > 4) {
 			/*
 			 * disable GART TBL walk error reporting, which
 			 * trips off incorrectly with the IOMMU & 3ware
@@ -1337,19 +1519,56 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 			 */
 			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
 		}
-		if (c->x86 <= 17 && mce_bootlog < 0) {
+		if (c->x86 <= 17 && cfg->bootlog < 0) {
 			/*
 			 * Lots of broken BIOS around that don't clear them
 			 * by default and leave crap in there. Don't log:
 			 */
-			mce_bootlog = 0;
+			cfg->bootlog = 0;
 		}
 		/*
 		 * Various K7s with broken bank 0 around. Always disable
 		 * by default.
 		 */
-		if (c->x86 == 6 && banks > 0)
+		if (c->x86 == 6 && cfg->banks > 0)
 			mce_banks[0].ctl = 0;
+
+		/*
+		 * Turn off MC4_MISC thresholding banks on those models since
+		 * they're not supported there.
+		 */
+		if (c->x86 == 0x15 &&
+		    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+			int i;
+			u64 val, hwcr;
+			bool need_toggle;
+			u32 msrs[] = {
+				0x00000413, /* MC4_MISC0 */
+				0xc0000408, /* MC4_MISC1 */
+			};
+
+			rdmsrl(MSR_K7_HWCR, hwcr);
+
+			/* McStatusWrEn has to be set */
+			need_toggle = !(hwcr & BIT(18));
+
+			if (need_toggle)
+				wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+			for (i = 0; i < ARRAY_SIZE(msrs); i++) {
+				rdmsrl(msrs[i], val);
+
+				/* CntP bit set? */
+				if (val & BIT_64(62)) {
+					val &= ~BIT_64(62);
+					wrmsrl(msrs[i], val);
+				}
+			}
+
+			/* restore old settings */
+			if (need_toggle)
+				wrmsrl(MSR_K7_HWCR, hwcr);
+		}
 	}
 
 	if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1362,7 +1581,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 		 * valid event later, merely don't write CTL0.
 		 */
 
-		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
+		if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
 			mce_banks[0].init = 0;
 
 		/*
@@ -1370,25 +1589,28 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 		 * All newer Intel systems support MCE broadcasting. Enable
 		 * synchronization with a one second timeout.
 		 */
 		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
-			monarch_timeout < 0)
-			monarch_timeout = USEC_PER_SEC;
+			cfg->monarch_timeout < 0)
+			cfg->monarch_timeout = USEC_PER_SEC;
 
 		/*
 		 * There are also broken BIOSes on some Pentium M and
 		 * earlier systems:
 		 */
-		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
-			mce_bootlog = 0;
+		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
+			cfg->bootlog = 0;
+
+		if (c->x86 == 6 && c->x86_model == 45)
+			quirk_no_way_out = quirk_sandybridge_ifu;
 	}
-	if (monarch_timeout < 0)
-		monarch_timeout = 0;
-	if (mce_bootlog != 0)
-		mce_panic_timeout = 30;
+	if (cfg->monarch_timeout < 0)
+		cfg->monarch_timeout = 0;
+	if (cfg->bootlog != 0)
+		cfg->panic_timeout = 30;
 
 	return 0;
 }
 
-static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
 {
 	if (c->x86 != 5)
		return 0;
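The AMD Fam15h models 0x10-0x1f quirk above is a classic guarded read-modify-write: MCi_MISC registers only accept writes while HWCR[18] (`McStatusWrEn`) is set, so the code toggles that bit around clearing CntP (bit 62) and then restores the original HWCR. The shape of the pattern, with hypothetical `msr_read`/`msr_write` helpers standing in for `rdmsrl`/`wrmsrl`:

    #include <stdint.h>
    #include <stdbool.h>

    #define BIT64(n)    (1ULL << (n))

    /* Hypothetical MSR accessors for the sketch. */
    uint64_t msr_read(uint32_t msr);
    void msr_write(uint32_t msr, uint64_t val);

    static void clear_cntp(uint32_t hwcr_msr, const uint32_t *misc, int n)
    {
        uint64_t hwcr = msr_read(hwcr_msr);
        bool need_toggle = !(hwcr & BIT64(18));  /* McStatusWrEn clear? */

        if (need_toggle)                 /* unlock MCi_MISC writes */
            msr_write(hwcr_msr, hwcr | BIT64(18));

        for (int i = 0; i < n; i++) {
            uint64_t val = msr_read(misc[i]);
            if (val & BIT64(62))         /* CntP: counter present */
                msr_write(misc[i], val & ~BIT64(62));
        }

        if (need_toggle)                 /* restore the original HWCR */
            msr_write(hwcr_msr, hwcr);
    }
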
@@ -1412,6 +1634,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
 		mce_intel_feature_init(c);
+		mce_adjust_timer = mce_intel_adjust_timer;
 		break;
 	case X86_VENDOR_AMD:
 		mce_amd_feature_init(c);
@@ -1421,27 +1644,32 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 	}
 }
 
-static void __mcheck_cpu_init_timer(void)
+static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 {
-	struct timer_list *t = &__get_cpu_var(mce_timer);
-	int *n = &__get_cpu_var(mce_next_interval);
+	unsigned long iv = check_interval * HZ;
 
-	setup_timer(t, mce_start_timer, smp_processor_id());
-
-	if (mce_ignore_ce)
+	if (mca_cfg.ignore_ce || !iv)
 		return;
 
-	*n = check_interval * HZ;
-	if (!*n)
-		return;
-	t->expires = round_jiffies(jiffies + *n);
-	add_timer_on(t, smp_processor_id());
+	per_cpu(mce_next_interval, cpu) = iv;
+
+	t->expires = round_jiffies(jiffies + iv);
+	add_timer_on(t, cpu);
+}
+
+static void __mcheck_cpu_init_timer(void)
+{
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+	unsigned int cpu = smp_processor_id();
+
+	setup_timer(t, mce_timer_fn, cpu);
+	mce_start_timer(cpu, t);
 }
 
 /* Handle unconfigured int18 (should never happen) */
 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
 {
-	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
 	       smp_processor_id());
 }
 
@@ -1453,9 +1681,9 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off:
 */
-void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
+void mcheck_cpu_init(struct cpuinfo_x86 *c)
 {
-	if (mce_disabled)
+	if (mca_cfg.disabled)
 		return;
 
 	if (__mcheck_cpu_ancient_init(c))
@@ -1465,7 +1693,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
 		return;
 
 	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
-		mce_disabled = 1;
+		mca_cfg.disabled = true;
 		return;
 	}
 
@@ -1541,6 +1769,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
 	/* Error or no more MCE record */
 	if (rc <= 0) {
 		mce_apei_read_done = 1;
+		/*
+		 * When ERST is disabled, mce_chrdev_read() should return
+		 * "no record" instead of "no device."
+		 */
+		if (rc == -ENODEV)
+			return 0;
 		return rc;
 	}
	rc = -EFAULT;
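`mce_adjust_timer` follows the default-hook pattern: start with a no-op and let vendor init swap in a specialized version, so callers never have to test the vendor themselves. A standalone illustration of the idiom (the Intel divisor here is made up, not what `mce_intel_adjust_timer` does):

    #include <stdio.h>

    static unsigned long adjust_default(unsigned long iv) { return iv; }
    static unsigned long adjust_intel(unsigned long iv)  { return iv / 4; }

    static unsigned long (*adjust_hook)(unsigned long) = adjust_default;

    static void vendor_init(int is_intel)
    {
        if (is_intel)
            adjust_hook = adjust_intel;  /* override only where needed */
    }

    int main(void)
    {
        vendor_init(1);
        printf("%lu\n", adjust_hook(1000));  /* callers just call the hook */
        return 0;
    }
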
@@ -1718,6 +1952,25 @@ static struct miscdevice mce_chrdev_device = {
 	&mce_chrdev_ops,
 };
 
+static void __mce_disable_bank(void *arg)
+{
+	int bank = *((int *)arg);
+	__clear_bit(bank, __get_cpu_var(mce_poll_banks));
+	cmci_disable_bank(bank);
+}
+
+void mce_disable_bank(int bank)
+{
+	if (bank >= mca_cfg.banks) {
+		pr_warn(FW_BUG
+			"Ignoring request to disable invalid MCA bank %d.\n",
+			bank);
+		return;
+	}
+	set_bit(bank, mce_banks_ce_disabled);
+	on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
 /*
  * mce=off Disables machine check
  * mce=no_cmci Disables CMCI
@@ -1728,9 +1981,12 @@ static struct miscdevice mce_chrdev_device = {
  *	check, or 0 to not wait
  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
  * mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
 */
 static int __init mcheck_enable(char *str)
 {
+	struct mca_config *cfg = &mca_cfg;
+
 	if (*str == 0) {
 		enable_p5_mce();
		return 1;
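Taken together, the option table above accepts either a keyword or a `tolerant[,monarch_timeout]` pair. A few illustrative boot command lines (the numeric values are made up):

    mce=off                    # disable machine check handling entirely
    mce=no_cmci                # poll instead of using CMCI interrupts
    mce=bios_cmci_threshold    # keep the BIOS-programmed CMCI threshold
    mce=2,500000               # tolerant=2, monarch timeout 500000 us
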
@@ -1738,24 +1994,25 @@ static int __init mcheck_enable(char *str)
 	if (*str == '=')
 		str++;
 	if (!strcmp(str, "off"))
-		mce_disabled = 1;
+		cfg->disabled = true;
 	else if (!strcmp(str, "no_cmci"))
-		mce_cmci_disabled = 1;
+		cfg->cmci_disabled = true;
 	else if (!strcmp(str, "dont_log_ce"))
-		mce_dont_log_ce = 1;
+		cfg->dont_log_ce = true;
 	else if (!strcmp(str, "ignore_ce"))
-		mce_ignore_ce = 1;
+		cfg->ignore_ce = true;
 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
-		mce_bootlog = (str[0] == 'b');
+		cfg->bootlog = (str[0] == 'b');
+	else if (!strcmp(str, "bios_cmci_threshold"))
+		cfg->bios_cmci_threshold = true;
 	else if (isdigit(str[0])) {
-		get_option(&str, &tolerant);
+		get_option(&str, &(cfg->tolerant));
 		if (*str == ',') {
 			++str;
-			get_option(&str, &monarch_timeout);
+			get_option(&str, &(cfg->monarch_timeout));
 		}
 	} else {
-		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
-		       str);
+		pr_info("mce argument %s ignored. Please use /sys\n", str);
 		return 0;
 	}
 	return 1;
@@ -1781,7 +2038,7 @@ static int mce_disable_error_reporting(void)
 {
 	int i;
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		if (b->init)
@@ -1859,9 +2116,8 @@ static struct bus_type mce_subsys = {
 	.dev_name	= "machinecheck",
 };
 
-struct device *mce_device[CONFIG_NR_CPUS];
+DEFINE_PER_CPU(struct device *, mce_device);
 
-__cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
@@ -1921,15 +2177,15 @@ static ssize_t set_ignore_ce(struct device *s,
 	if (strict_strtoull(buf, 0, &new) < 0)
 		return -EINVAL;
 
-	if (mce_ignore_ce ^ !!new) {
+	if (mca_cfg.ignore_ce ^ !!new) {
 		if (new) {
 			/* disable ce features */
 			mce_timer_delete_all();
 			on_each_cpu(mce_disable_cmci, NULL, 1);
-			mce_ignore_ce = 1;
+			mca_cfg.ignore_ce = true;
 		} else {
 			/* enable ce features */
-			mce_ignore_ce = 0;
+			mca_cfg.ignore_ce = false;
 			on_each_cpu(mce_enable_ce, (void *)1, 1);
 		}
 	}
@@ -1945,14 +2201,14 @@ static ssize_t set_cmci_disabled(struct device *s,
 	if (strict_strtoull(buf, 0, &new) < 0)
 		return -EINVAL;
 
-	if (mce_cmci_disabled ^ !!new) {
+	if (mca_cfg.cmci_disabled ^ !!new) {
 		if (new) {
 			/* disable cmci */
 			on_each_cpu(mce_disable_cmci, NULL, 1);
-			mce_cmci_disabled = 1;
+			mca_cfg.cmci_disabled = true;
 		} else {
 			/* enable cmci */
-			mce_cmci_disabled = 0;
+			mca_cfg.cmci_disabled = false;
 			on_each_cpu(mce_enable_ce, NULL, 1);
 		}
 	}
@@ -1969,9 +2225,9 @@ static ssize_t store_int_with_restart(struct device *s,
 }
 
 static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
-static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
-static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
+static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
 
 static struct dev_ext_attribute dev_attr_check_interval = {
 	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
@@ -1979,13 +2235,13 @@ static struct dev_ext_attribute dev_attr_check_interval = {
 };
 
 static struct dev_ext_attribute dev_attr_ignore_ce = {
-	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
-	&mce_ignore_ce
+	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
+	&mca_cfg.ignore_ce
 };
 
 static struct dev_ext_attribute dev_attr_cmci_disabled = {
-	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
-	&mce_cmci_disabled
+	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
+	&mca_cfg.cmci_disabled
 };
 
 static struct device_attribute *mce_device_attrs[] = {
@@ -2007,7 +2263,7 @@ static void mce_device_release(struct device *dev)
 }
 
 /* Per cpu device init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_device_create(unsigned int cpu)
+static int mce_device_create(unsigned int cpu)
 {
 	struct device *dev;
	int err;
@@ -2024,21 +2280,23 @@ static int mce_device_create(unsigned int cpu)
 	dev->release = &mce_device_release;
 
 	err = device_register(dev);
-	if (err)
+	if (err) {
+		put_device(dev);
 		return err;
+	}
 
 	for (i = 0; mce_device_attrs[i]; i++) {
 		err = device_create_file(dev, mce_device_attrs[i]);
 		if (err)
 			goto error;
 	}
-	for (j = 0; j < banks; j++) {
+	for (j = 0; j < mca_cfg.banks; j++) {
 		err = device_create_file(dev, &mce_banks[j].attr);
 		if (err)
 			goto error2;
 	}
 	cpumask_set_cpu(cpu, mce_device_initialized);
-	mce_device[cpu] = dev;
+	per_cpu(mce_device, cpu) = dev;
 
 	return 0;
 error2:
@@ -2053,9 +2311,9 @@ error:
 	return err;
 }
 
-static __cpuinit void mce_device_remove(unsigned int cpu)
+static void mce_device_remove(unsigned int cpu)
 {
-	struct device *dev = mce_device[cpu];
+	struct device *dev = per_cpu(mce_device, cpu);
 	int i;
 
 	if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2064,16 +2322,16 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
 	for (i = 0; mce_device_attrs[i]; i++)
 		device_remove_file(dev, mce_device_attrs[i]);
 
-	for (i = 0; i < banks; i++)
+	for (i = 0; i < mca_cfg.banks; i++)
 		device_remove_file(dev, &mce_banks[i].attr);
 
 	device_unregister(dev);
 	cpumask_clear_cpu(cpu, mce_device_initialized);
-	mce_device[cpu] = NULL;
+	per_cpu(mce_device, cpu) = NULL;
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
-static void __cpuinit mce_disable_cpu(void *h)
+static void mce_disable_cpu(void *h)
 {
 	unsigned long action = *(unsigned long *)h;
 	int i;
@@ -2083,7 +2341,7 @@ static void __cpuinit mce_disable_cpu(void *h)
 	if (!(action & CPU_TASKS_FROZEN))
 		cmci_clear();
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
 		if (b->init)
@@ -2091,7 +2349,7 @@ static void __cpuinit mce_disable_cpu(void *h)
 	}
 }
 
-static void __cpuinit mce_reenable_cpu(void *h)
+static void mce_reenable_cpu(void *h)
 {
 	unsigned long action = *(unsigned long *)h;
 	int i;
@@ -2101,7 +2359,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
 	if (!(action & CPU_TASKS_FROZEN))
 		cmci_reenable();
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 
		if (b->init)
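The `put_device()` addition above is the canonical fix for a subtle driver-core rule: once `device_register()` has been called, the device is refcounted and the only legal way to dispose of it is to drop the reference, which invokes the `release` hook; freeing it directly would bypass that. A minimal sketch of the pattern using the kernel API, mirroring how mce.c gets its device name from the bus's `dev_name` ("machinecheck"):

    #include <linux/device.h>
    #include <linux/slab.h>

    static void demo_release(struct device *dev)
    {
        kfree(dev);    /* the release hook owns the final free */
    }

    /* 'bus' is assumed to set .dev_name, which supplies the sysfs name. */
    static int demo_create(struct bus_type *bus, u32 id)
    {
        struct device *dev;
        int err;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
            return -ENOMEM;

        dev->id = id;
        dev->bus = bus;
        dev->release = demo_release;

        err = device_register(dev);
        if (err) {
            put_device(dev);    /* NOT kfree(): let demo_release() run */
            return err;
        }
        return 0;
    }
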
@@ -2110,48 +2368,43 @@ static void mce_reenable_cpu(void *h)
 }
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int __cpuinit
+static int
 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct timer_list *t = &per_cpu(mce_timer, cpu);
 
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
 		mce_device_create(cpu);
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
 		break;
 	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
 		mce_device_remove(cpu);
+		mce_intel_hcpu_update(cpu);
 		break;
 	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-		del_timer_sync(t);
 		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+		del_timer_sync(t);
 		break;
 	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		if (!mce_ignore_ce && check_interval) {
-			t->expires = round_jiffies(jiffies +
-					__get_cpu_var(mce_next_interval));
-			add_timer_on(t, cpu);
-		}
 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+		mce_start_timer(cpu, t);
 		break;
-	case CPU_POST_DEAD:
+	}
+
+	if (action == CPU_POST_DEAD) {
 		/* intentionally ignoring frozen here */
-		cmci_rediscover(cpu);
-		break;
+		cmci_rediscover();
 	}
+
 	return NOTIFY_OK;
 }
 
-static struct notifier_block mce_cpu_notifier __cpuinitdata = {
+static struct notifier_block mce_cpu_notifier = {
 	.notifier_call = mce_cpu_callback,
 };
 
@@ -2159,7 +2412,7 @@ static __init void mce_init_banks(void)
 {
 	int i;
 
-	for (i = 0; i < banks; i++) {
+	for (i = 0; i < mca_cfg.banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
 		struct device_attribute *a = &b->attr;
 
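The notifier rework masks `CPU_TASKS_FROZEN` once instead of enumerating every `*_FROZEN` case, so suspend/resume transitions share the normal hotplug paths; only `CPU_POST_DEAD` is tested unmasked, deliberately ignoring the frozen variant. The masking idiom in isolation (constant values as in the kernel's `<linux/cpu.h>` of that era, shown here only for illustration):

    #include <stdio.h>

    #define CPU_ONLINE       0x0002
    #define CPU_DEAD         0x0007
    #define CPU_TASKS_FROZEN 0x0010    /* OR-ed onto the action */

    static void callback(unsigned long action)
    {
        switch (action & ~CPU_TASKS_FROZEN) {  /* one case covers both */
        case CPU_ONLINE:
            puts("online (or online-frozen)");
            break;
        case CPU_DEAD:
            puts("dead (or dead-frozen)");
            break;
        }
    }

    int main(void)
    {
        callback(CPU_ONLINE);
        callback(CPU_ONLINE | CPU_TASKS_FROZEN);  /* same branch */
        return 0;
    }
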
@@ -2178,39 +2431,78 @@ static __init int mcheck_init_device(void)
 	int err;
 	int i = 0;
 
-	if (!mce_available(&boot_cpu_data))
-		return -EIO;
+	if (!mce_available(&boot_cpu_data)) {
+		err = -EIO;
+		goto err_out;
+	}
 
-	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
+	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+		err = -ENOMEM;
+		goto err_out;
+	}
 
 	mce_init_banks();
 
 	err = subsys_system_register(&mce_subsys, NULL);
 	if (err)
-		return err;
+		goto err_out_mem;
 
+	cpu_notifier_register_begin();
 	for_each_online_cpu(i) {
 		err = mce_device_create(i);
-		if (err)
-			return err;
+		if (err) {
+			/*
+			 * Register notifier anyway (and do not unreg it) so
+			 * that we don't leave undeleted timers, see notifier
			 * callback above.
+			 */
+			__register_hotcpu_notifier(&mce_cpu_notifier);
+			cpu_notifier_register_done();
+			goto err_device_create;
+		}
 	}
 
+	__register_hotcpu_notifier(&mce_cpu_notifier);
+	cpu_notifier_register_done();
+
 	register_syscore_ops(&mce_syscore_ops);
-	register_hotcpu_notifier(&mce_cpu_notifier);
 
 	/* register character device /dev/mcelog */
-	misc_register(&mce_chrdev_device);
+	err = misc_register(&mce_chrdev_device);
+	if (err)
+		goto err_register;
+
+	return 0;
+
+err_register:
+	unregister_syscore_ops(&mce_syscore_ops);
+
+err_device_create:
+	/*
+	 * We didn't keep track of which devices were created above, but
+	 * even if we had, the set of online cpus might have changed.
+	 * Play safe and remove for every possible cpu, since
+	 * mce_device_remove() will do the right thing.
+	 */
+	for_each_possible_cpu(i)
+		mce_device_remove(i);
+
+err_out_mem:
+	free_cpumask_var(mce_device_initialized);
+
+err_out:
+	pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
 
 	return err;
 }
-device_initcall(mcheck_init_device);
+device_initcall_sync(mcheck_init_device);
 
 /*
 * Old style boot options parsing. Only for compatibility.
 */
 static int __init mcheck_disable(char *str)
 {
-	mce_disabled = 1;
+	mca_cfg.disabled = true;
 	return 1;
 }
 __setup("nomce", mcheck_disable);
