aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel/cpu/mcheck/mce.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c792
1 files changed, 542 insertions, 250 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5a11ae2e9e9..9a79c8dbd8e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -7,6 +7,9 @@
* Copyright 2008 Intel Corporation
* Author: Andi Kleen
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
@@ -55,35 +58,24 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
-int mce_disabled __read_mostly;
-
-#define MISC_MCELOG_MINOR 227
-
#define SPINUNIT 100 /* 100ns */
-atomic_t mce_entry;
-
DEFINE_PER_CPU(unsigned, mce_exception_count);
-/*
- * Tolerant levels:
- * 0: always panic on uncorrected errors, log corrected errors
- * 1: panic or SIGBUS on uncorrected errors, log corrected errors
- * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
- * 3: never panic or SIGBUS, log all errors (for testing only)
- */
-static int tolerant __read_mostly = 1;
-static int banks __read_mostly;
-static int rip_msr __read_mostly;
-static int mce_bootlog __read_mostly = -1;
-static int monarch_timeout __read_mostly = -1;
-static int mce_panic_timeout __read_mostly;
-static int mce_dont_log_ce __read_mostly;
-int mce_cmci_disabled __read_mostly;
-int mce_ignore_ce __read_mostly;
-int mce_ser __read_mostly;
-
-struct mce_bank *mce_banks __read_mostly;
+struct mce_bank *mce_banks __read_mostly;
+
+struct mca_config mca_cfg __read_mostly = {
+ .bootlog = -1,
+ /*
+ * Tolerant levels:
+ * 0: always panic on uncorrected errors, log corrected errors
+ * 1: panic or SIGBUS on uncorrected errors, log corrected errors
+ * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
+ * 3: never panic or SIGBUS, log all errors (for testing only)
+ */
+ .tolerant = 1,
+ .monarch_timeout = -1
+};
/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
@@ -95,13 +87,30 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
-/* MCA banks polled by the period polling timer for corrected events */
+/* CMCI storm detection filter */
+static DEFINE_PER_CPU(unsigned long, mce_polled_error);
+
+/*
+ * MCA banks polled by the period polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
+ */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
static DEFINE_PER_CPU(struct work_struct, mce_work);
+static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
@@ -191,7 +200,7 @@ static void drain_mcelog_buffer(void)
{
unsigned int next, i, prev = 0;
- next = rcu_dereference_check_mce(mcelog.next);
+ next = ACCESS_ONCE(mcelog.next);
do {
struct mce *m;
@@ -210,7 +219,7 @@ static void drain_mcelog_buffer(void)
cpu_relax();
if (!m->finished && retries >= 4) {
- pr_err("MCE: skipping error being logged currently!\n");
+ pr_err("skipping error being logged currently!\n");
break;
}
}
@@ -298,7 +307,7 @@ static void wait_for_panic(void)
while (timeout-- > 0)
udelay(1);
if (panic_timeout == 0)
- panic_timeout = mce_panic_timeout;
+ panic_timeout = mca_cfg.panic_timeout;
panic("Panicing machine check CPU died");
}
@@ -356,7 +365,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
if (panic_timeout == 0)
- panic_timeout = mce_panic_timeout;
+ panic_timeout = mca_cfg.panic_timeout;
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -368,7 +377,7 @@ static int msr_to_offset(u32 msr)
{
unsigned bank = __this_cpu_read(injectm.bank);
- if (msr == rip_msr)
+ if (msr == mca_cfg.rip_msr)
return offsetof(struct mce, ip);
if (msr == MSR_IA32_MCx_STATUS(bank))
return offsetof(struct mce, status);
@@ -437,10 +446,18 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
m->ip = regs->ip;
m->cs = regs->cs;
+
+ /*
+ * When in VM86 mode make the cs look like ring 3
+ * always. This is a lie, but it's better than passing
+ * the additional vm86 bit around everywhere.
+ */
+ if (v8086_mode(regs))
+ m->cs |= 3;
}
/* Use accurate RIP reporting if available. */
- if (rip_msr)
- m->ip = mce_rdmsrl(rip_msr);
+ if (mca_cfg.rip_msr)
+ m->ip = mce_rdmsrl(mca_cfg.rip_msr);
}
}
@@ -501,18 +518,15 @@ static int mce_ring_add(unsigned long pfn)
int mce_available(struct cpuinfo_x86 *c)
{
- if (mce_disabled)
+ if (mca_cfg.disabled)
return 0;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
static void mce_schedule_work(void)
{
- if (!mce_ring_empty()) {
- struct work_struct *work = &__get_cpu_var(mce_work);
- if (!work_pending(work))
- schedule_work(work);
- }
+ if (!mce_ring_empty())
+ schedule_work(&__get_cpu_var(mce_work));
}
DEFINE_PER_CPU(struct irq_work, mce_irq_work);
@@ -540,6 +554,27 @@ static void mce_report_event(struct pt_regs *regs)
irq_work_queue(&__get_cpu_var(mce_irq_work));
}
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+ if (m->status & MCI_STATUS_MISCV)
+ m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+ if (m->status & MCI_STATUS_ADDRV) {
+ m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+ /*
+ * Mask the reported address by the reported granularity.
+ */
+ if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
+ u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+ m->addr >>= shift;
+ m->addr <<= shift;
+ }
+ }
+}
+
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
@@ -562,11 +597,11 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
struct mce m;
int i;
- percpu_inc(mce_poll_count);
+ this_cpu_inc(mce_poll_count);
mce_gather_info(&m, NULL);
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
@@ -580,6 +615,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(m.status & MCI_STATUS_VAL))
continue;
+ this_cpu_write(mce_polled_error, 1);
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
@@ -587,13 +623,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* TBD do the same check for MCI_STATUS_EN here?
*/
if (!(flags & MCP_UC) &&
- (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
+ (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
continue;
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+ mce_read_aux(&m, i);
if (!(flags & MCP_TIMESTAMP))
m.tsc = 0;
@@ -601,7 +634,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
mce_log(&m);
/*
@@ -623,16 +656,22 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
-static int mce_no_way_out(struct mce *m, char **msg)
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+ struct pt_regs *regs)
{
- int i;
+ int i, ret = 0;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
- if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
- return 1;
+ if (m->status & MCI_STATUS_VAL) {
+ __set_bit(i, validp);
+ if (quirk_no_way_out)
+ quirk_no_way_out(i, m, regs);
+ }
+ if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
+ ret = 1;
}
- return 0;
+ return ret;
}
/*
@@ -660,11 +699,10 @@ static int mce_timed_out(u64 *t)
rmb();
if (atomic_read(&mce_paniced))
wait_for_panic();
- if (!monarch_timeout)
+ if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
- /* CHECKME: Make panic default for 1 too? */
- if (tolerant < 1)
+ if (mca_cfg.tolerant <= 1)
mce_panic("Timeout synchronizing machine check over CPUs",
NULL, NULL);
cpu_missing = 1;
@@ -714,7 +752,8 @@ static void mce_reign(void)
* Grade the severity of the errors of all the CPUs.
*/
for_each_possible_cpu(cpu) {
- int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+ int severity = mce_severity(&per_cpu(mces_seen, cpu),
+ mca_cfg.tolerant,
&nmsg);
if (severity > global_worst) {
msg = nmsg;
@@ -728,7 +767,7 @@ static void mce_reign(void)
* This dumps all the mces in the log buffer and stops the
* other CPUs.
*/
- if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+ if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
mce_panic("Fatal Machine check", m, msg);
/*
@@ -741,7 +780,7 @@ static void mce_reign(void)
* No machine check event found. Must be some external
* source or one CPU is hung. Panic.
*/
- if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
+ if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
mce_panic("Machine check from unknown source", NULL, NULL);
/*
@@ -765,7 +804,7 @@ static int mce_start(int *no_way_out)
{
int order;
int cpus = num_online_cpus();
- u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
if (!timeout)
return -1;
@@ -829,7 +868,7 @@ static int mce_start(int *no_way_out)
static int mce_end(int order)
{
int ret = -1;
- u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
if (!timeout)
goto reset;
@@ -910,13 +949,58 @@ static void mce_clear_state(unsigned long *toclear)
{
int i;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
if (test_bit(i, toclear))
mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
}
}
/*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define MCE_INFO_MAX 16
+
+struct mce_info {
+ atomic_t inuse;
+ struct task_struct *t;
+ __u64 paddr;
+ int restartable;
+} mce_info[MCE_INFO_MAX];
+
+static void mce_save_info(__u64 addr, int c)
+{
+ struct mce_info *mi;
+
+ for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+ if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+ mi->t = current;
+ mi->paddr = addr;
+ mi->restartable = c;
+ return;
+ }
+ }
+
+ mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+
+static struct mce_info *mce_find_info(void)
+{
+ struct mce_info *mi;
+
+ for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+ if (atomic_read(&mi->inuse) && mi->t == current)
+ return mi;
+ return NULL;
+}
+
+static void mce_clear_info(struct mce_info *mi)
+{
+ atomic_set(&mi->inuse, 0);
+}
+
+/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
*
@@ -930,6 +1014,7 @@ static void mce_clear_state(unsigned long *toclear)
*/
void do_machine_check(struct pt_regs *regs, long error_code)
{
+ struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
int i;
int worst = 0;
@@ -941,7 +1026,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
int order;
/*
* If no_way_out gets set, there is no safe way to recover from this
- * MCE. If tolerant is cranked up, we'll try anyway.
+ * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
*/
int no_way_out = 0;
/*
@@ -950,13 +1035,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
*/
int kill_it = 0;
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
- atomic_inc(&mce_entry);
-
- percpu_inc(mce_exception_count);
+ this_cpu_inc(mce_exception_count);
- if (!banks)
+ if (!cfg->banks)
goto out;
mce_gather_info(&m, regs);
@@ -964,12 +1048,15 @@ void do_machine_check(struct pt_regs *regs, long error_code)
final = &__get_cpu_var(mces_seen);
*final = m;
- no_way_out = mce_no_way_out(&m, &msg);
+ memset(valid_banks, 0, sizeof(valid_banks));
+ no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
barrier();
/*
- * When no restart IP must always kill or panic.
+ * When no restart IP might need to kill or panic.
+ * Assume the worst for now, but if we find the
+ * severity is MCE_AR_SEVERITY we have other options.
*/
if (!(m.mcgstatus & MCG_STATUS_RIPV))
kill_it = 1;
@@ -980,8 +1067,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* because the first one to see it will clear it.
*/
order = mce_start(&no_way_out);
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < cfg->banks; i++) {
__clear_bit(i, toclear);
+ if (!test_bit(i, valid_banks))
+ continue;
if (!mce_banks[i].ctl)
continue;
@@ -997,16 +1086,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* Non uncorrected or non signaled errors are handled by
* machine_check_poll. Leave them alone, unless this panics.
*/
- if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
!no_way_out)
continue;
/*
* Set taint even when machine check was not enabled.
*/
- add_taint(TAINT_MACHINE_CHECK);
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
- severity = mce_severity(&m, tolerant, NULL);
+ severity = mce_severity(&m, cfg->tolerant, NULL);
/*
* When machine check was for corrected handler don't touch,
@@ -1023,23 +1112,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
continue;
}
- /*
- * Kill on action required.
- */
- if (severity == MCE_AR_SEVERITY)
- kill_it = 1;
-
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+ mce_read_aux(&m, i);
/*
* Action optional error. Queue address for later processing.
* When the ring overflows we just ignore the AO error.
* RED-PEN add some logging mechanism when
* usable_address or mce_add_ring fails.
- * RED-PEN don't ignore overflow for tolerant == 0
+ * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
*/
if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
mce_ring_add(m.addr >> PAGE_SHIFT);
@@ -1052,6 +1132,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
}
}
+ /* mce_clear_state will clear *final, save locally for use later */
+ m = *final;
+
if (!no_way_out)
mce_clear_state(toclear);
@@ -1063,65 +1146,91 @@ void do_machine_check(struct pt_regs *regs, long error_code)
no_way_out = worst >= MCE_PANIC_SEVERITY;
/*
- * If we have decided that we just CAN'T continue, and the user
- * has not set tolerant to an insane level, give up and die.
- *
- * This is mainly used in the case when the system doesn't
- * support MCE broadcasting or it has been disabled.
+ * At insane "tolerant" levels we take no action. Otherwise
+ * we only die if we have no other choice. For less serious
+ * issues we try to recover, or limit damage to the current
+ * process.
*/
- if (no_way_out && tolerant < 3)
- mce_panic("Fatal machine check on current CPU", final, msg);
-
- /*
- * If the error seems to be unrecoverable, something should be
- * done. Try to kill as little as possible. If we can kill just
- * one task, do that. If the user has set the tolerance very
- * high, don't try to do anything at all.
- */
-
- if (kill_it && tolerant < 3)
- force_sig(SIGBUS, current);
-
- /* notify userspace ASAP */
- set_thread_flag(TIF_MCE_NOTIFY);
+ if (cfg->tolerant < 3) {
+ if (no_way_out)
+ mce_panic("Fatal machine check on current CPU", &m, msg);
+ if (worst == MCE_AR_SEVERITY) {
+ /* schedule action before return to userland */
+ mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
+ set_thread_flag(TIF_MCE_NOTIFY);
+ } else if (kill_it) {
+ force_sig(SIGBUS, current);
+ }
+ }
if (worst > 0)
mce_report_event(regs);
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
- atomic_dec(&mce_entry);
sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int vector, int flags)
{
- printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+ /* mce_severity() should not hand us an ACTION_REQUIRED error */
+ BUG_ON(flags & MF_ACTION_REQUIRED);
+ pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+ "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+ pfn);
+
+ return 0;
}
+#endif
/*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in process context that interrupted by MCE and marked with
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
*/
void mce_notify_process(void)
{
unsigned long pfn;
- mce_notify_irq();
- while (mce_ring_get(&pfn))
- memory_failure(pfn, MCE_VECTOR);
+ struct mce_info *mi = mce_find_info();
+ int flags = MF_ACTION_REQUIRED;
+
+ if (!mi)
+ mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+ pfn = mi->paddr >> PAGE_SHIFT;
+
+ clear_thread_flag(TIF_MCE_NOTIFY);
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx",
+ mi->paddr);
+ /*
+ * We must call memory_failure() here even if the current process is
+ * doomed. We still need to mark the page as poisoned and alert any
+ * other users of the page.
+ */
+ if (!mi->restartable)
+ flags |= MF_MUST_KILL;
+ if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
+ pr_err("Memory error not recovered");
+ force_sig(SIGBUS, current);
+ }
+ mce_clear_info(mi);
}
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
static void mce_process_work(struct work_struct *dummy)
{
- mce_notify_process();
+ unsigned long pfn;
+
+ while (mce_ring_get(&pfn))
+ memory_failure(pfn, MCE_VECTOR, 0);
}
#ifdef CONFIG_X86_MCE_INTEL
@@ -1154,35 +1263,79 @@ void mce_log_therm_throt_event(__u64 status)
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
-static int check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = 5 * 60; /* 5 minutes */
-static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mce_start_timer(unsigned long data)
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+ return interval;
+}
+
+static unsigned long (*mce_adjust_timer)(unsigned long interval) =
+ mce_adjust_timer_default;
+
+static int cmc_error_seen(void)
+{
+ unsigned long *v = &__get_cpu_var(mce_polled_error);
+
+ return test_and_clear_bit(0, v);
+}
+
+static void mce_timer_fn(unsigned long data)
{
- struct timer_list *t = &per_cpu(mce_timer, data);
- int *n;
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long iv;
+ int notify;
WARN_ON(smp_processor_id() != data);
if (mce_available(__this_cpu_ptr(&cpu_info))) {
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
+ mce_intel_cmci_poll();
}
/*
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
- n = &__get_cpu_var(mce_next_interval);
- if (mce_notify_irq())
- *n = max(*n/2, HZ/100);
- else
- *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+ iv = __this_cpu_read(mce_next_interval);
+ notify = mce_notify_irq();
+ notify |= cmc_error_seen();
+ if (notify) {
+ iv = max(iv / 2, (unsigned long) HZ/100);
+ } else {
+ iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+ iv = mce_adjust_timer(iv);
+ }
+ __this_cpu_write(mce_next_interval, iv);
+ /* Might have become 0 after CMCI storm subsided */
+ if (iv) {
+ t->expires = jiffies + iv;
+ add_timer_on(t, smp_processor_id());
+ }
+}
- t->expires = jiffies + *n;
- add_timer_on(t, smp_processor_id());
+/*
+ * Ensure that the timer is firing in @interval from now.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long when = jiffies + interval;
+ unsigned long iv = __this_cpu_read(mce_next_interval);
+
+ if (timer_pending(t)) {
+ if (time_before(when, t->expires))
+ mod_timer_pinned(t, when);
+ } else {
+ t->expires = round_jiffies(when);
+ add_timer_on(t, smp_processor_id());
+ }
+ if (interval < iv)
+ __this_cpu_write(mce_next_interval, interval);
}
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -1211,18 +1364,11 @@ int mce_notify_irq(void)
/* Not more than two messages every minute */
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
- clear_thread_flag(TIF_MCE_NOTIFY);
-
if (test_and_clear_bit(0, &mce_need_notify)) {
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
- /*
- * There is no risk of missing notifications because
- * work_pending is always cleared before the function is
- * executed.
- */
- if (mce_helper[0] && !work_pending(&mce_trigger_work))
+ if (mce_helper[0])
schedule_work(&mce_trigger_work);
if (__ratelimit(&ratelimit))
@@ -1234,14 +1380,16 @@ int mce_notify_irq(void)
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
-static int __cpuinit __mcheck_cpu_mce_banks_init(void)
+static int __mcheck_cpu_mce_banks_init(void)
{
int i;
+ u8 num_banks = mca_cfg.banks;
- mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
+ mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
if (!mce_banks)
return -ENOMEM;
- for (i = 0; i < banks; i++) {
+
+ for (i = 0; i < num_banks; i++) {
struct mce_bank *b = &mce_banks[i];
b->ctl = -1ULL;
@@ -1253,7 +1401,7 @@ static int __cpuinit __mcheck_cpu_mce_banks_init(void)
/*
* Initialize Machine Checks for a CPU.
*/
-static int __cpuinit __mcheck_cpu_cap_init(void)
+static int __mcheck_cpu_cap_init(void)
{
unsigned b;
u64 cap;
@@ -1261,19 +1409,19 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
rdmsrl(MSR_IA32_MCG_CAP, cap);
b = cap & MCG_BANKCNT_MASK;
- if (!banks)
- printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+ if (!mca_cfg.banks)
+ pr_info("CPU supports %d MCE banks\n", b);
if (b > MAX_NR_BANKS) {
- printk(KERN_WARNING
- "MCE: Using only %u machine check banks out of %u\n",
+ pr_warn("Using only %u machine check banks out of %u\n",
MAX_NR_BANKS, b);
b = MAX_NR_BANKS;
}
/* Don't support asymmetric configurations today */
- WARN_ON(banks != 0 && b != banks);
- banks = b;
+ WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
+ mca_cfg.banks = b;
+
if (!mce_banks) {
int err = __mcheck_cpu_mce_banks_init();
@@ -1283,25 +1431,29 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
/* Use accurate RIP reporting if available. */
if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
- rip_msr = MSR_IA32_MCG_EIP;
+ mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
if (cap & MCG_SER_P)
- mce_ser = 1;
+ mca_cfg.ser = true;
return 0;
}
static void __mcheck_cpu_init_generic(void)
{
+ enum mcp_flags m_fl = 0;
mce_banks_t all_banks;
u64 cap;
int i;
+ if (!mca_cfg.bootlog)
+ m_fl = MCP_DONTLOG;
+
/*
* Log the machine checks left over from the previous reset.
*/
bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
+ machine_check_poll(MCP_UC | m_fl, &all_banks);
set_in_cr4(X86_CR4_MCE);
@@ -1309,7 +1461,7 @@ static void __mcheck_cpu_init_generic(void)
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (!b->init)
@@ -1319,17 +1471,47 @@ static void __mcheck_cpu_init_generic(void)
}
}
+/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 0)
+ return;
+ if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+ return;
+ if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+ MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+ MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+ MCACOD)) !=
+ (MCI_STATUS_UC|MCI_STATUS_EN|
+ MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+ MCI_STATUS_AR|MCACOD_INSTR))
+ return;
+
+ m->mcgstatus |= MCG_STATUS_EIPV;
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+}
+
/* Add per CPU specific workarounds here */
-static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
+ struct mca_config *cfg = &mca_cfg;
+
if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
- pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
+ pr_info("unknown CPU type - not enabling MCE support\n");
return -EOPNOTSUPP;
}
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
- if (c->x86 == 15 && banks > 4) {
+ if (c->x86 == 15 && cfg->banks > 4) {
/*
* disable GART TBL walk error reporting, which
* trips off incorrectly with the IOMMU & 3ware
@@ -1337,19 +1519,56 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
*/
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
- if (c->x86 <= 17 && mce_bootlog < 0) {
+ if (c->x86 <= 17 && cfg->bootlog < 0) {
/*
* Lots of broken BIOS around that don't clear them
* by default and leave crap in there. Don't log:
*/
- mce_bootlog = 0;
+ cfg->bootlog = 0;
}
/*
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
- if (c->x86 == 6 && banks > 0)
+ if (c->x86 == 6 && cfg->banks > 0)
mce_banks[0].ctl = 0;
+
+ /*
+ * Turn off MC4_MISC thresholding banks on those models since
+ * they're not supported there.
+ */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+ int i;
+ u64 val, hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
+ 0x00000413, /* MC4_MISC0 */
+ 0xc0000408, /* MC4_MISC1 */
+ };
+
+ rdmsrl(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+ for (i = 0; i < ARRAY_SIZE(msrs); i++) {
+ rdmsrl(msrs[i], val);
+
+ /* CntP bit set? */
+ if (val & BIT_64(62)) {
+ val &= ~BIT_64(62);
+ wrmsrl(msrs[i], val);
+ }
+ }
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+ }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1362,7 +1581,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* valid event later, merely don't write CTL0.
*/
- if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
+ if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
mce_banks[0].init = 0;
/*
@@ -1370,25 +1589,28 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* synchronization with a one second timeout.
*/
if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
- monarch_timeout < 0)
- monarch_timeout = USEC_PER_SEC;
+ cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
/*
* There are also broken BIOSes on some Pentium M and
* earlier systems:
*/
- if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
- mce_bootlog = 0;
+ if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
+ cfg->bootlog = 0;
+
+ if (c->x86 == 6 && c->x86_model == 45)
+ quirk_no_way_out = quirk_sandybridge_ifu;
}
- if (monarch_timeout < 0)
- monarch_timeout = 0;
- if (mce_bootlog != 0)
- mce_panic_timeout = 30;
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = 0;
+ if (cfg->bootlog != 0)
+ cfg->panic_timeout = 30;
return 0;
}
-static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
return 0;
@@ -1412,6 +1634,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
+ mce_adjust_timer = mce_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
@@ -1421,27 +1644,32 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
}
}
-static void __mcheck_cpu_init_timer(void)
+static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(mce_next_interval);
+ unsigned long iv = check_interval * HZ;
- setup_timer(t, mce_start_timer, smp_processor_id());
-
- if (mce_ignore_ce)
+ if (mca_cfg.ignore_ce || !iv)
return;
- *n = check_interval * HZ;
- if (!*n)
- return;
- t->expires = round_jiffies(jiffies + *n);
- add_timer_on(t, smp_processor_id());
+ per_cpu(mce_next_interval, cpu) = iv;
+
+ t->expires = round_jiffies(jiffies + iv);
+ add_timer_on(t, cpu);
+}
+
+static void __mcheck_cpu_init_timer(void)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned int cpu = smp_processor_id();
+
+ setup_timer(t, mce_timer_fn, cpu);
+ mce_start_timer(cpu, t);
}
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
- printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+ pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
smp_processor_id());
}
@@ -1453,9 +1681,9 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off:
*/
-void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
+void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
- if (mce_disabled)
+ if (mca_cfg.disabled)
return;
if (__mcheck_cpu_ancient_init(c))
@@ -1465,7 +1693,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
return;
if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
- mce_disabled = 1;
+ mca_cfg.disabled = true;
return;
}
@@ -1541,6 +1769,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
/* Error or no more MCE record */
if (rc <= 0) {
mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
return rc;
}
rc = -EFAULT;
@@ -1718,6 +1952,25 @@ static struct miscdevice mce_chrdev_device = {
&mce_chrdev_ops,
};
+static void __mce_disable_bank(void *arg)
+{
+ int bank = *((int *)arg);
+ __clear_bit(bank, __get_cpu_var(mce_poll_banks));
+ cmci_disable_bank(bank);
+}
+
+void mce_disable_bank(int bank)
+{
+ if (bank >= mca_cfg.banks) {
+ pr_warn(FW_BUG
+ "Ignoring request to disable invalid MCA bank %d.\n",
+ bank);
+ return;
+ }
+ set_bit(bank, mce_banks_ce_disabled);
+ on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
/*
* mce=off Disables machine check
* mce=no_cmci Disables CMCI
@@ -1728,9 +1981,12 @@ static struct miscdevice mce_chrdev_device = {
* check, or 0 to not wait
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
*/
static int __init mcheck_enable(char *str)
{
+ struct mca_config *cfg = &mca_cfg;
+
if (*str == 0) {
enable_p5_mce();
return 1;
@@ -1738,24 +1994,25 @@ static int __init mcheck_enable(char *str)
if (*str == '=')
str++;
if (!strcmp(str, "off"))
- mce_disabled = 1;
+ cfg->disabled = true;
else if (!strcmp(str, "no_cmci"))
- mce_cmci_disabled = 1;
+ cfg->cmci_disabled = true;
else if (!strcmp(str, "dont_log_ce"))
- mce_dont_log_ce = 1;
+ cfg->dont_log_ce = true;
else if (!strcmp(str, "ignore_ce"))
- mce_ignore_ce = 1;
+ cfg->ignore_ce = true;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
- mce_bootlog = (str[0] == 'b');
+ cfg->bootlog = (str[0] == 'b');
+ else if (!strcmp(str, "bios_cmci_threshold"))
+ cfg->bios_cmci_threshold = true;
else if (isdigit(str[0])) {
- get_option(&str, &tolerant);
+ get_option(&str, &(cfg->tolerant));
if (*str == ',') {
++str;
- get_option(&str, &monarch_timeout);
+ get_option(&str, &(cfg->monarch_timeout));
}
} else {
- printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
- str);
+ pr_info("mce argument %s ignored. Please use /sys\n", str);
return 0;
}
return 1;
@@ -1781,7 +2038,7 @@ static int mce_disable_error_reporting(void)
{
int i;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
@@ -1859,9 +2116,8 @@ static struct bus_type mce_subsys = {
.dev_name = "machinecheck",
};
-struct device *mce_device[CONFIG_NR_CPUS];
+DEFINE_PER_CPU(struct device *, mce_device);
-__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
@@ -1921,15 +2177,15 @@ static ssize_t set_ignore_ce(struct device *s,
if (strict_strtoull(buf, 0, &new) < 0)
return -EINVAL;
- if (mce_ignore_ce ^ !!new) {
+ if (mca_cfg.ignore_ce ^ !!new) {
if (new) {
/* disable ce features */
mce_timer_delete_all();
on_each_cpu(mce_disable_cmci, NULL, 1);
- mce_ignore_ce = 1;
+ mca_cfg.ignore_ce = true;
} else {
/* enable ce features */
- mce_ignore_ce = 0;
+ mca_cfg.ignore_ce = false;
on_each_cpu(mce_enable_ce, (void *)1, 1);
}
}
@@ -1945,14 +2201,14 @@ static ssize_t set_cmci_disabled(struct device *s,
if (strict_strtoull(buf, 0, &new) < 0)
return -EINVAL;
- if (mce_cmci_disabled ^ !!new) {
+ if (mca_cfg.cmci_disabled ^ !!new) {
if (new) {
/* disable cmci */
on_each_cpu(mce_disable_cmci, NULL, 1);
- mce_cmci_disabled = 1;
+ mca_cfg.cmci_disabled = true;
} else {
/* enable cmci */
- mce_cmci_disabled = 0;
+ mca_cfg.cmci_disabled = false;
on_each_cpu(mce_enable_ce, NULL, 1);
}
}
@@ -1969,9 +2225,9 @@ static ssize_t store_int_with_restart(struct device *s,
}
static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
-static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
-static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
+static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
static struct dev_ext_attribute dev_attr_check_interval = {
__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
@@ -1979,13 +2235,13 @@ static struct dev_ext_attribute dev_attr_check_interval = {
};
static struct dev_ext_attribute dev_attr_ignore_ce = {
- __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
- &mce_ignore_ce
+ __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
+ &mca_cfg.ignore_ce
};
static struct dev_ext_attribute dev_attr_cmci_disabled = {
- __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
- &mce_cmci_disabled
+ __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
+ &mca_cfg.cmci_disabled
};
static struct device_attribute *mce_device_attrs[] = {
@@ -2007,7 +2263,7 @@ static void mce_device_release(struct device *dev)
}
/* Per cpu device init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_device_create(unsigned int cpu)
+static int mce_device_create(unsigned int cpu)
{
struct device *dev;
int err;
@@ -2024,21 +2280,23 @@ static __cpuinit int mce_device_create(unsigned int cpu)
dev->release = &mce_device_release;
err = device_register(dev);
- if (err)
+ if (err) {
+ put_device(dev);
return err;
+ }
for (i = 0; mce_device_attrs[i]; i++) {
err = device_create_file(dev, mce_device_attrs[i]);
if (err)
goto error;
}
- for (j = 0; j < banks; j++) {
+ for (j = 0; j < mca_cfg.banks; j++) {
err = device_create_file(dev, &mce_banks[j].attr);
if (err)
goto error2;
}
cpumask_set_cpu(cpu, mce_device_initialized);
- mce_device[cpu] = dev;
+ per_cpu(mce_device, cpu) = dev;
return 0;
error2:
@@ -2053,9 +2311,9 @@ error:
return err;
}
-static __cpuinit void mce_device_remove(unsigned int cpu)
+static void mce_device_remove(unsigned int cpu)
{
- struct device *dev = mce_device[cpu];
+ struct device *dev = per_cpu(mce_device, cpu);
int i;
if (!cpumask_test_cpu(cpu, mce_device_initialized))
@@ -2064,16 +2322,16 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
for (i = 0; mce_device_attrs[i]; i++)
device_remove_file(dev, mce_device_attrs[i]);
- for (i = 0; i < banks; i++)
+ for (i = 0; i < mca_cfg.banks; i++)
device_remove_file(dev, &mce_banks[i].attr);
device_unregister(dev);
cpumask_clear_cpu(cpu, mce_device_initialized);
- mce_device[cpu] = NULL;
+ per_cpu(mce_device, cpu) = NULL;
}
/* Make sure there are no machine checks on offlined CPUs. */
-static void __cpuinit mce_disable_cpu(void *h)
+static void mce_disable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
int i;
@@ -2083,7 +2341,7 @@ static void __cpuinit mce_disable_cpu(void *h)
if (!(action & CPU_TASKS_FROZEN))
cmci_clear();
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
@@ -2091,7 +2349,7 @@ static void __cpuinit mce_disable_cpu(void *h)
}
}
-static void __cpuinit mce_reenable_cpu(void *h)
+static void mce_reenable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
int i;
@@ -2101,7 +2359,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
if (!(action & CPU_TASKS_FROZEN))
cmci_reenable();
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
@@ -2110,48 +2368,43 @@ static void __cpuinit mce_reenable_cpu(void *h)
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int __cpuinit
+static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct timer_list *t = &per_cpu(mce_timer, cpu);
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
mce_device_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
case CPU_DEAD:
- case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
mce_device_remove(cpu);
+ mce_intel_hcpu_update(cpu);
break;
case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- del_timer_sync(t);
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ del_timer_sync(t);
break;
case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!mce_ignore_ce && check_interval) {
- t->expires = round_jiffies(jiffies +
- __get_cpu_var(mce_next_interval));
- add_timer_on(t, cpu);
- }
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ mce_start_timer(cpu, t);
break;
- case CPU_POST_DEAD:
+ }
+
+ if (action == CPU_POST_DEAD) {
/* intentionally ignoring frozen here */
- cmci_rediscover(cpu);
- break;
+ cmci_rediscover();
}
+
return NOTIFY_OK;
}
-static struct notifier_block mce_cpu_notifier __cpuinitdata = {
+static struct notifier_block mce_cpu_notifier = {
.notifier_call = mce_cpu_callback,
};
@@ -2159,7 +2412,7 @@ static __init void mce_init_banks(void)
{
int i;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
struct device_attribute *a = &b->attr;
@@ -2178,39 +2431,78 @@ static __init int mcheck_init_device(void)
int err;
int i = 0;
- if (!mce_available(&boot_cpu_data))
- return -EIO;
+ if (!mce_available(&boot_cpu_data)) {
+ err = -EIO;
+ goto err_out;
+ }
- zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
+ if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
mce_init_banks();
err = subsys_system_register(&mce_subsys, NULL);
if (err)
- return err;
+ goto err_out_mem;
+ cpu_notifier_register_begin();
for_each_online_cpu(i) {
err = mce_device_create(i);
- if (err)
- return err;
+ if (err) {
+ /*
+ * Register notifier anyway (and do not unreg it) so
+ * that we don't leave undeleted timers, see notifier
+ * callback above.
+ */
+ __register_hotcpu_notifier(&mce_cpu_notifier);
+ cpu_notifier_register_done();
+ goto err_device_create;
+ }
}
+ __register_hotcpu_notifier(&mce_cpu_notifier);
+ cpu_notifier_register_done();
+
register_syscore_ops(&mce_syscore_ops);
- register_hotcpu_notifier(&mce_cpu_notifier);
/* register character device /dev/mcelog */
- misc_register(&mce_chrdev_device);
+ err = misc_register(&mce_chrdev_device);
+ if (err)
+ goto err_register;
+
+ return 0;
+
+err_register:
+ unregister_syscore_ops(&mce_syscore_ops);
+
+err_device_create:
+ /*
+ * We didn't keep track of which devices were created above, but
+ * even if we had, the set of online cpus might have changed.
+ * Play safe and remove for every possible cpu, since
+ * mce_device_remove() will do the right thing.
+ */
+ for_each_possible_cpu(i)
+ mce_device_remove(i);
+
+err_out_mem:
+ free_cpumask_var(mce_device_initialized);
+
+err_out:
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
return err;
}
-device_initcall(mcheck_init_device);
+device_initcall_sync(mcheck_init_device);
/*
* Old style boot options parsing. Only for compatibility.
*/
static int __init mcheck_disable(char *str)
{
- mce_disabled = 1;
+ mca_cfg.disabled = true;
return 1;
}
__setup("nomce", mcheck_disable);