aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel/cpu/mcheck/mce.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce.c')
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c1290
1 files changed, 820 insertions, 470 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d916183b7f9..9a79c8dbd8e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -7,10 +7,12 @@
* Copyright 2008 Intel Corporation
* Author: Andi Kleen
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/thread_info.h>
#include <linux/capability.h>
#include <linux/miscdevice.h>
-#include <linux/interrupt.h>
#include <linux/ratelimit.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
@@ -20,7 +22,8 @@
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
+#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
#include <linux/sched.h>
@@ -36,95 +39,84 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
-#include <linux/edac_mce.h>
+#include <linux/irq_work.h>
+#include <linux/export.h>
#include <asm/processor.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/ipi.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include "mce-internal.h"
-static DEFINE_MUTEX(mce_read_mutex);
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define rcu_dereference_check_mce(p) \
rcu_dereference_index_check((p), \
rcu_read_lock_sched_held() || \
- lockdep_is_held(&mce_read_mutex))
+ lockdep_is_held(&mce_chrdev_read_mutex))
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
-int mce_disabled __read_mostly;
-
-#define MISC_MCELOG_MINOR 227
-
#define SPINUNIT 100 /* 100ns */
-atomic_t mce_entry;
-
DEFINE_PER_CPU(unsigned, mce_exception_count);
-/*
- * Tolerant levels:
- * 0: always panic on uncorrected errors, log corrected errors
- * 1: panic or SIGBUS on uncorrected errors, log corrected errors
- * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
- * 3: never panic or SIGBUS, log all errors (for testing only)
- */
-static int tolerant __read_mostly = 1;
-static int banks __read_mostly;
-static int rip_msr __read_mostly;
-static int mce_bootlog __read_mostly = -1;
-static int monarch_timeout __read_mostly = -1;
-static int mce_panic_timeout __read_mostly;
-static int mce_dont_log_ce __read_mostly;
-int mce_cmci_disabled __read_mostly;
-int mce_ignore_ce __read_mostly;
-int mce_ser __read_mostly;
-
-struct mce_bank *mce_banks __read_mostly;
+struct mce_bank *mce_banks __read_mostly;
+
+struct mca_config mca_cfg __read_mostly = {
+ .bootlog = -1,
+ /*
+ * Tolerant levels:
+ * 0: always panic on uncorrected errors, log corrected errors
+ * 1: panic or SIGBUS on uncorrected errors, log corrected errors
+ * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
+ * 3: never panic or SIGBUS, log all errors (for testing only)
+ */
+ .tolerant = 1,
+ .monarch_timeout = -1
+};
/* User mode helper program triggered by machine check event */
static unsigned long mce_need_notify;
static char mce_helper[128];
static char *mce_helper_argv[2] = { mce_helper, NULL };
-static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
+/* CMCI storm detection filter */
+static DEFINE_PER_CPU(unsigned long, mce_polled_error);
+
/*
- * CPU/chipset specific EDAC code can register a notifier call here to print
- * MCE errors in a human-readable form.
+ * MCA banks polled by the period polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
*/
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
-EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
-
-static int default_decode_mce(struct notifier_block *nb, unsigned long val,
- void *data)
-{
- pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
- pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
-
- return NOTIFY_STOP;
-}
-
-static struct notifier_block mce_dec_nb = {
- .notifier_call = default_decode_mce,
- .priority = -1,
-};
-
-/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
static DEFINE_PER_CPU(struct work_struct, mce_work);
+static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
@@ -135,9 +127,7 @@ void mce_setup(struct mce *m)
m->time = get_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
m->cpuid = cpuid_eax(1);
-#ifdef CONFIG_SMP
m->socketid = cpu_data(m->extcpu).phys_proc_id;
-#endif
m->apicid = cpu_data(m->extcpu).initial_apicid;
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}
@@ -160,23 +150,20 @@ static struct mce_log mcelog = {
void mce_log(struct mce *mce)
{
unsigned next, entry;
+ int ret = 0;
/* Emit the trace record: */
trace_mce_record(mce);
+ ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+ if (ret == NOTIFY_STOP)
+ return;
+
mce->finished = 0;
wmb();
for (;;) {
entry = rcu_dereference_check_mce(mcelog.next);
for (;;) {
- /*
- * If edac_mce is enabled, it will check the error type
- * and will process it, if it is a known error.
- * Otherwise, the error will be sent through mcelog
- * interface
- */
- if (edac_mce_parse(mce))
- return;
/*
* When the buffer fills up discard new entries.
@@ -209,8 +196,61 @@ void mce_log(struct mce *mce)
set_bit(0, &mce_need_notify);
}
+static void drain_mcelog_buffer(void)
+{
+ unsigned int next, i, prev = 0;
+
+ next = ACCESS_ONCE(mcelog.next);
+
+ do {
+ struct mce *m;
+
+ /* drain what was logged during boot */
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+ unsigned retries = 1;
+
+ m = &mcelog.entry[i];
+
+ while (!m->finished) {
+ if (time_after_eq(jiffies, start + 2*retries))
+ retries++;
+
+ cpu_relax();
+
+ if (!m->finished && retries >= 4) {
+ pr_err("skipping error being logged currently!\n");
+ break;
+ }
+ }
+ smp_rmb();
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+ }
+
+ memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
+}
+
+
+void mce_register_decode_chain(struct notifier_block *nb)
+{
+ atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
+ drain_mcelog_buffer();
+}
+EXPORT_SYMBOL_GPL(mce_register_decode_chain);
+
+void mce_unregister_decode_chain(struct notifier_block *nb)
+{
+ atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+
static void print_mce(struct mce *m)
{
+ int ret = 0;
+
pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
m->extcpu, m->mcgstatus, m->bank, m->status);
@@ -231,14 +271,23 @@ static void print_mce(struct mce *m)
pr_cont("MISC %llx ", m->misc);
pr_cont("\n");
- pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
- m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
+ /*
+ * Note this output is parsed by external tools and old fields
+ * should not be changed.
+ */
+ pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
+ m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
+ cpu_data(m->extcpu).microcode);
/*
* Print out human-readable details about the MCE error,
* (if the CPU has an implementation for that)
*/
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+ ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+ if (ret == NOTIFY_STOP)
+ return;
+
+ pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}
#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -258,7 +307,7 @@ static void wait_for_panic(void)
while (timeout-- > 0)
udelay(1);
if (panic_timeout == 0)
- panic_timeout = mce_panic_timeout;
+ panic_timeout = mca_cfg.panic_timeout;
panic("Panicing machine check CPU died");
}
@@ -316,7 +365,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
if (panic_timeout == 0)
- panic_timeout = mce_panic_timeout;
+ panic_timeout = mca_cfg.panic_timeout;
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -328,7 +377,7 @@ static int msr_to_offset(u32 msr)
{
unsigned bank = __this_cpu_read(injectm.bank);
- if (msr == rip_msr)
+ if (msr == mca_cfg.rip_msr)
return offsetof(struct mce, ip);
if (msr == MSR_IA32_MCx_STATUS(bank))
return offsetof(struct mce, status);
@@ -380,6 +429,39 @@ static void mce_wrmsrl(u32 msr, u64 v)
}
/*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+{
+ mce_setup(m);
+
+ m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ if (regs) {
+ /*
+ * Get the address of the instruction at the time of
+ * the machine check error.
+ */
+ if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+
+ /*
+ * When in VM86 mode make the cs look like ring 3
+ * always. This is a lie, but it's better than passing
+ * the additional vm86 bit around everywhere.
+ */
+ if (v8086_mode(regs))
+ m->cs |= 3;
+ }
+ /* Use accurate RIP reporting if available. */
+ if (mca_cfg.rip_msr)
+ m->ip = mce_rdmsrl(mca_cfg.rip_msr);
+ }
+}
+
+/*
* Simple lockless ring to communicate PFNs from the exception handler with the
* process context work function. This is vastly simplified because there's
* only a single reader and a single writer.
@@ -436,54 +518,24 @@ static int mce_ring_add(unsigned long pfn)
int mce_available(struct cpuinfo_x86 *c)
{
- if (mce_disabled)
+ if (mca_cfg.disabled)
return 0;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
static void mce_schedule_work(void)
{
- if (!mce_ring_empty()) {
- struct work_struct *work = &__get_cpu_var(mce_work);
- if (!work_pending(work))
- schedule_work(work);
- }
+ if (!mce_ring_empty())
+ schedule_work(&__get_cpu_var(mce_work));
}
-/*
- * Get the address of the instruction at the time of the machine check
- * error.
- */
-static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
-{
-
- if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
- m->ip = regs->ip;
- m->cs = regs->cs;
- } else {
- m->ip = 0;
- m->cs = 0;
- }
- if (rip_msr)
- m->ip = mce_rdmsrl(rip_msr);
-}
+DEFINE_PER_CPU(struct irq_work, mce_irq_work);
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been reenabled again
- * when a MCE happened during an interrupts off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+static void mce_irq_work_cb(struct irq_work *entry)
{
- ack_APIC_irq();
- exit_idle();
- irq_enter();
mce_notify_irq();
mce_schedule_work();
- irq_exit();
}
-#endif
static void mce_report_event(struct pt_regs *regs)
{
@@ -499,29 +551,28 @@ static void mce_report_event(struct pt_regs *regs)
return;
}
-#ifdef CONFIG_X86_LOCAL_APIC
- /*
- * Without APIC do not notify. The event will be picked
- * up eventually.
- */
- if (!cpu_has_apic)
- return;
+ irq_work_queue(&__get_cpu_var(mce_irq_work));
+}
- /*
- * When interrupts are disabled we cannot use
- * kernel services safely. Trigger an self interrupt
- * through the APIC to instead do the notification
- * after interrupts are reenabled again.
- */
- apic->send_IPI_self(MCE_SELF_VECTOR);
+/*
+ * Read ADDR and MISC registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+ if (m->status & MCI_STATUS_MISCV)
+ m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+ if (m->status & MCI_STATUS_ADDRV) {
+ m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
- /*
- * Wait for idle afterwards again so that we don't leave the
- * APIC in a non idle state because the normal APIC writes
- * cannot exclude us.
- */
- apic_wait_icr_idle();
-#endif
+ /*
+ * Mask the reported address by the reported granularity.
+ */
+ if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
+ u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+ m->addr >>= shift;
+ m->addr <<= shift;
+ }
+ }
}
DEFINE_PER_CPU(unsigned, mce_poll_count);
@@ -546,12 +597,11 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
struct mce m;
int i;
- percpu_inc(mce_poll_count);
+ this_cpu_inc(mce_poll_count);
- mce_setup(&m);
+ mce_gather_info(&m, NULL);
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
@@ -565,6 +615,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(m.status & MCI_STATUS_VAL))
continue;
+ this_cpu_write(mce_polled_error, 1);
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
@@ -572,13 +623,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* TBD do the same check for MCI_STATUS_EN here?
*/
if (!(flags & MCP_UC) &&
- (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
+ (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
continue;
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+ mce_read_aux(&m, i);
if (!(flags & MCP_TIMESTAMP))
m.tsc = 0;
@@ -586,11 +634,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
mce_log(&m);
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
- add_taint(TAINT_MACHINE_CHECK);
- }
/*
* Clear state for this bank.
@@ -611,16 +656,22 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
-static int mce_no_way_out(struct mce *m, char **msg)
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+ struct pt_regs *regs)
{
- int i;
+ int i, ret = 0;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
- if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
- return 1;
+ if (m->status & MCI_STATUS_VAL) {
+ __set_bit(i, validp);
+ if (quirk_no_way_out)
+ quirk_no_way_out(i, m, regs);
+ }
+ if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
+ ret = 1;
}
- return 0;
+ return ret;
}
/*
@@ -648,11 +699,10 @@ static int mce_timed_out(u64 *t)
rmb();
if (atomic_read(&mce_paniced))
wait_for_panic();
- if (!monarch_timeout)
+ if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
- /* CHECKME: Make panic default for 1 too? */
- if (tolerant < 1)
+ if (mca_cfg.tolerant <= 1)
mce_panic("Timeout synchronizing machine check over CPUs",
NULL, NULL);
cpu_missing = 1;
@@ -702,7 +752,8 @@ static void mce_reign(void)
* Grade the severity of the errors of all the CPUs.
*/
for_each_possible_cpu(cpu) {
- int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
+ int severity = mce_severity(&per_cpu(mces_seen, cpu),
+ mca_cfg.tolerant,
&nmsg);
if (severity > global_worst) {
msg = nmsg;
@@ -716,7 +767,7 @@ static void mce_reign(void)
* This dumps all the mces in the log buffer and stops the
* other CPUs.
*/
- if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
+ if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
mce_panic("Fatal Machine check", m, msg);
/*
@@ -729,7 +780,7 @@ static void mce_reign(void)
* No machine check event found. Must be some external
* source or one CPU is hung. Panic.
*/
- if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
+ if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
mce_panic("Machine check from unknown source", NULL, NULL);
/*
@@ -753,7 +804,7 @@ static int mce_start(int *no_way_out)
{
int order;
int cpus = num_online_cpus();
- u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
if (!timeout)
return -1;
@@ -817,7 +868,7 @@ static int mce_start(int *no_way_out)
static int mce_end(int order)
{
int ret = -1;
- u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
+ u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
if (!timeout)
goto reset;
@@ -881,15 +932,15 @@ reset:
* Check if the address reported by the CPU is in a format we can parse.
* It would be possible to add code for most other cases, but all would
* be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses upto page granuality for now.
+ * parser). So only support physical addresses up to page granuality for now.
*/
static int mce_usable_address(struct mce *m)
{
if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
return 0;
- if ((m->misc & 0x3f) > PAGE_SHIFT)
+ if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
return 0;
- if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+ if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
return 0;
return 1;
}
@@ -898,13 +949,58 @@ static void mce_clear_state(unsigned long *toclear)
{
int i;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
if (test_bit(i, toclear))
mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
}
}
/*
+ * Need to save faulting physical address associated with a process
+ * in the machine check handler some place where we can grab it back
+ * later in mce_notify_process()
+ */
+#define MCE_INFO_MAX 16
+
+struct mce_info {
+ atomic_t inuse;
+ struct task_struct *t;
+ __u64 paddr;
+ int restartable;
+} mce_info[MCE_INFO_MAX];
+
+static void mce_save_info(__u64 addr, int c)
+{
+ struct mce_info *mi;
+
+ for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
+ if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
+ mi->t = current;
+ mi->paddr = addr;
+ mi->restartable = c;
+ return;
+ }
+ }
+
+ mce_panic("Too many concurrent recoverable errors", NULL, NULL);
+}
+
+static struct mce_info *mce_find_info(void)
+{
+ struct mce_info *mi;
+
+ for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
+ if (atomic_read(&mi->inuse) && mi->t == current)
+ return mi;
+ return NULL;
+}
+
+static void mce_clear_info(struct mce_info *mi)
+{
+ atomic_set(&mi->inuse, 0);
+}
+
+/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
*
@@ -918,6 +1014,7 @@ static void mce_clear_state(unsigned long *toclear)
*/
void do_machine_check(struct pt_regs *regs, long error_code)
{
+ struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
int i;
int worst = 0;
@@ -929,7 +1026,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
int order;
/*
* If no_way_out gets set, there is no safe way to recover from this
- * MCE. If tolerant is cranked up, we'll try anyway.
+ * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
*/
int no_way_out = 0;
/*
@@ -938,30 +1035,28 @@ void do_machine_check(struct pt_regs *regs, long error_code)
*/
int kill_it = 0;
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
+ DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
- atomic_inc(&mce_entry);
+ this_cpu_inc(mce_exception_count);
- percpu_inc(mce_exception_count);
-
- if (notify_die(DIE_NMI, "machine check", regs, error_code,
- 18, SIGKILL) == NOTIFY_STOP)
- goto out;
- if (!banks)
+ if (!cfg->banks)
goto out;
- mce_setup(&m);
+ mce_gather_info(&m, regs);
- m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
final = &__get_cpu_var(mces_seen);
*final = m;
- no_way_out = mce_no_way_out(&m, &msg);
+ memset(valid_banks, 0, sizeof(valid_banks));
+ no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
barrier();
/*
- * When no restart IP must always kill or panic.
+ * When no restart IP might need to kill or panic.
+ * Assume the worst for now, but if we find the
+ * severity is MCE_AR_SEVERITY we have other options.
*/
if (!(m.mcgstatus & MCG_STATUS_RIPV))
kill_it = 1;
@@ -972,8 +1067,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* because the first one to see it will clear it.
*/
order = mce_start(&no_way_out);
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < cfg->banks; i++) {
__clear_bit(i, toclear);
+ if (!test_bit(i, valid_banks))
+ continue;
if (!mce_banks[i].ctl)
continue;
@@ -989,16 +1086,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* Non uncorrected or non signaled errors are handled by
* machine_check_poll. Leave them alone, unless this panics.
*/
- if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
+ if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
!no_way_out)
continue;
/*
* Set taint even when machine check was not enabled.
*/
- add_taint(TAINT_MACHINE_CHECK);
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
- severity = mce_severity(&m, tolerant, NULL);
+ severity = mce_severity(&m, cfg->tolerant, NULL);
/*
* When machine check was for corrected handler don't touch,
@@ -1015,28 +1112,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
continue;
}
- /*
- * Kill on action required.
- */
- if (severity == MCE_AR_SEVERITY)
- kill_it = 1;
-
- if (m.status & MCI_STATUS_MISCV)
- m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
- if (m.status & MCI_STATUS_ADDRV)
- m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+ mce_read_aux(&m, i);
/*
* Action optional error. Queue address for later processing.
* When the ring overflows we just ignore the AO error.
* RED-PEN add some logging mechanism when
* usable_address or mce_add_ring fails.
- * RED-PEN don't ignore overflow for tolerant == 0
+ * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
*/
if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
mce_ring_add(m.addr >> PAGE_SHIFT);
- mce_get_rip(&m, regs);
mce_log(&m);
if (severity > worst) {
@@ -1045,6 +1132,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
}
}
+ /* mce_clear_state will clear *final, save locally for use later */
+ m = *final;
+
if (!no_way_out)
mce_clear_state(toclear);
@@ -1056,65 +1146,91 @@ void do_machine_check(struct pt_regs *regs, long error_code)
no_way_out = worst >= MCE_PANIC_SEVERITY;
/*
- * If we have decided that we just CAN'T continue, and the user
- * has not set tolerant to an insane level, give up and die.
- *
- * This is mainly used in the case when the system doesn't
- * support MCE broadcasting or it has been disabled.
+ * At insane "tolerant" levels we take no action. Otherwise
+ * we only die if we have no other choice. For less serious
+ * issues we try to recover, or limit damage to the current
+ * process.
*/
- if (no_way_out && tolerant < 3)
- mce_panic("Fatal machine check on current CPU", final, msg);
-
- /*
- * If the error seems to be unrecoverable, something should be
- * done. Try to kill as little as possible. If we can kill just
- * one task, do that. If the user has set the tolerance very
- * high, don't try to do anything at all.
- */
-
- if (kill_it && tolerant < 3)
- force_sig(SIGBUS, current);
-
- /* notify userspace ASAP */
- set_thread_flag(TIF_MCE_NOTIFY);
+ if (cfg->tolerant < 3) {
+ if (no_way_out)
+ mce_panic("Fatal machine check on current CPU", &m, msg);
+ if (worst == MCE_AR_SEVERITY) {
+ /* schedule action before return to userland */
+ mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
+ set_thread_flag(TIF_MCE_NOTIFY);
+ } else if (kill_it) {
+ force_sig(SIGBUS, current);
+ }
+ }
if (worst > 0)
mce_report_event(regs);
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
- atomic_dec(&mce_entry);
sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
-/* dummy to break dependency. actual code is in mm/memory-failure.c */
-void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
+#ifndef CONFIG_MEMORY_FAILURE
+int memory_failure(unsigned long pfn, int vector, int flags)
{
- printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
+ /* mce_severity() should not hand us an ACTION_REQUIRED error */
+ BUG_ON(flags & MF_ACTION_REQUIRED);
+ pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+ "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+ pfn);
+
+ return 0;
}
+#endif
/*
- * Called after mce notification in process context. This code
- * is allowed to sleep. Call the high level VM handler to process
- * any corrupted pages.
- * Assume that the work queue code only calls this one at a time
- * per CPU.
- * Note we don't disable preemption, so this code might run on the wrong
- * CPU. In this case the event is picked up by the scheduled work queue.
- * This is merely a fast path to expedite processing in some common
- * cases.
+ * Called in process context that interrupted by MCE and marked with
+ * TIF_MCE_NOTIFY, just before returning to erroneous userland.
+ * This code is allowed to sleep.
+ * Attempt possible recovery such as calling the high level VM handler to
+ * process any corrupted pages, and kill/signal current process if required.
+ * Action required errors are handled here.
*/
void mce_notify_process(void)
{
unsigned long pfn;
- mce_notify_irq();
- while (mce_ring_get(&pfn))
- memory_failure(pfn, MCE_VECTOR);
+ struct mce_info *mi = mce_find_info();
+ int flags = MF_ACTION_REQUIRED;
+
+ if (!mi)
+ mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
+ pfn = mi->paddr >> PAGE_SHIFT;
+
+ clear_thread_flag(TIF_MCE_NOTIFY);
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx",
+ mi->paddr);
+ /*
+ * We must call memory_failure() here even if the current process is
+ * doomed. We still need to mark the page as poisoned and alert any
+ * other users of the page.
+ */
+ if (!mi->restartable)
+ flags |= MF_MUST_KILL;
+ if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
+ pr_err("Memory error not recovered");
+ force_sig(SIGBUS, current);
+ }
+ mce_clear_info(mi);
}
+/*
+ * Action optional processing happens here (picking up
+ * from the list of faulting pages that do_machine_check()
+ * placed into the "ring").
+ */
static void mce_process_work(struct work_struct *dummy)
{
- mce_notify_process();
+ unsigned long pfn;
+
+ while (mce_ring_get(&pfn))
+ memory_failure(pfn, MCE_VECTOR, 0);
}
#ifdef CONFIG_X86_MCE_INTEL
@@ -1147,35 +1263,88 @@ void mce_log_therm_throt_event(__u64 status)
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
-static int check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = 5 * 60; /* 5 minutes */
-static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
+static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static void mce_start_timer(unsigned long data)
+static unsigned long mce_adjust_timer_default(unsigned long interval)
{
- struct timer_list *t = &per_cpu(mce_timer, data);
- int *n;
+ return interval;
+}
+
+static unsigned long (*mce_adjust_timer)(unsigned long interval) =
+ mce_adjust_timer_default;
+
+static int cmc_error_seen(void)
+{
+ unsigned long *v = &__get_cpu_var(mce_polled_error);
+
+ return test_and_clear_bit(0, v);
+}
+
+static void mce_timer_fn(unsigned long data)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long iv;
+ int notify;
WARN_ON(smp_processor_id() != data);
if (mce_available(__this_cpu_ptr(&cpu_info))) {
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
+ mce_intel_cmci_poll();
}
/*
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
- n = &__get_cpu_var(mce_next_interval);
- if (mce_notify_irq())
- *n = max(*n/2, HZ/100);
- else
- *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+ iv = __this_cpu_read(mce_next_interval);
+ notify = mce_notify_irq();
+ notify |= cmc_error_seen();
+ if (notify) {
+ iv = max(iv / 2, (unsigned long) HZ/100);
+ } else {
+ iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+ iv = mce_adjust_timer(iv);
+ }
+ __this_cpu_write(mce_next_interval, iv);
+ /* Might have become 0 after CMCI storm subsided */
+ if (iv) {
+ t->expires = jiffies + iv;
+ add_timer_on(t, smp_processor_id());
+ }
+}
+
+/*
+ * Ensure that the timer is firing in @interval from now.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long when = jiffies + interval;
+ unsigned long iv = __this_cpu_read(mce_next_interval);
- t->expires = jiffies + *n;
- add_timer_on(t, smp_processor_id());
+ if (timer_pending(t)) {
+ if (time_before(when, t->expires))
+ mod_timer_pinned(t, when);
+ } else {
+ t->expires = round_jiffies(when);
+ add_timer_on(t, smp_processor_id());
+ }
+ if (interval < iv)
+ __this_cpu_write(mce_next_interval, interval);
+}
+
+/* Must not be called in IRQ context where del_timer_sync() can deadlock */
+static void mce_timer_delete_all(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ del_timer_sync(&per_cpu(mce_timer, cpu));
}
static void mce_do_trigger(struct work_struct *work)
@@ -1195,17 +1364,11 @@ int mce_notify_irq(void)
/* Not more than two messages every minute */
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
- clear_thread_flag(TIF_MCE_NOTIFY);
-
if (test_and_clear_bit(0, &mce_need_notify)) {
- wake_up_interruptible(&mce_wait);
+ /* wake processes polling /dev/mcelog */
+ wake_up_interruptible(&mce_chrdev_wait);
- /*
- * There is no risk of missing notifications because
- * work_pending is always cleared before the function is
- * executed.
- */
- if (mce_helper[0] && !work_pending(&mce_trigger_work))
+ if (mce_helper[0])
schedule_work(&mce_trigger_work);
if (__ratelimit(&ratelimit))
@@ -1217,14 +1380,16 @@ int mce_notify_irq(void)
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
-static int __cpuinit __mcheck_cpu_mce_banks_init(void)
+static int __mcheck_cpu_mce_banks_init(void)
{
int i;
+ u8 num_banks = mca_cfg.banks;
- mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
+ mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
if (!mce_banks)
return -ENOMEM;
- for (i = 0; i < banks; i++) {
+
+ for (i = 0; i < num_banks; i++) {
struct mce_bank *b = &mce_banks[i];
b->ctl = -1ULL;
@@ -1236,7 +1401,7 @@ static int __cpuinit __mcheck_cpu_mce_banks_init(void)
/*
* Initialize Machine Checks for a CPU.
*/
-static int __cpuinit __mcheck_cpu_cap_init(void)
+static int __mcheck_cpu_cap_init(void)
{
unsigned b;
u64 cap;
@@ -1244,19 +1409,19 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
rdmsrl(MSR_IA32_MCG_CAP, cap);
b = cap & MCG_BANKCNT_MASK;
- if (!banks)
- printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+ if (!mca_cfg.banks)
+ pr_info("CPU supports %d MCE banks\n", b);
if (b > MAX_NR_BANKS) {
- printk(KERN_WARNING
- "MCE: Using only %u machine check banks out of %u\n",
+ pr_warn("Using only %u machine check banks out of %u\n",
MAX_NR_BANKS, b);
b = MAX_NR_BANKS;
}
/* Don't support asymmetric configurations today */
- WARN_ON(banks != 0 && b != banks);
- banks = b;
+ WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
+ mca_cfg.banks = b;
+
if (!mce_banks) {
int err = __mcheck_cpu_mce_banks_init();
@@ -1266,25 +1431,29 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
/* Use accurate RIP reporting if available. */
if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
- rip_msr = MSR_IA32_MCG_EIP;
+ mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
if (cap & MCG_SER_P)
- mce_ser = 1;
+ mca_cfg.ser = true;
return 0;
}
static void __mcheck_cpu_init_generic(void)
{
+ enum mcp_flags m_fl = 0;
mce_banks_t all_banks;
u64 cap;
int i;
+ if (!mca_cfg.bootlog)
+ m_fl = MCP_DONTLOG;
+
/*
* Log the machine checks left over from the previous reset.
*/
bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
+ machine_check_poll(MCP_UC | m_fl, &all_banks);
set_in_cr4(X86_CR4_MCE);
@@ -1292,7 +1461,7 @@ static void __mcheck_cpu_init_generic(void)
if (cap & MCG_CTL_P)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (!b->init)
@@ -1302,17 +1471,47 @@ static void __mcheck_cpu_init_generic(void)
}
}
+/*
+ * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
+ * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
+ * Vol 3B Table 15-20). But this confuses both the code that determines
+ * whether the machine check occurred in kernel or user mode, and also
+ * the severity assessment code. Pretend that EIPV was set, and take the
+ * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
+ */
+static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
+{
+ if (bank != 0)
+ return;
+ if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
+ return;
+ if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
+ MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
+ MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
+ MCACOD)) !=
+ (MCI_STATUS_UC|MCI_STATUS_EN|
+ MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
+ MCI_STATUS_AR|MCACOD_INSTR))
+ return;
+
+ m->mcgstatus |= MCG_STATUS_EIPV;
+ m->ip = regs->ip;
+ m->cs = regs->cs;
+}
+
/* Add per CPU specific workarounds here */
-static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
{
+ struct mca_config *cfg = &mca_cfg;
+
if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
- pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
+ pr_info("unknown CPU type - not enabling MCE support\n");
return -EOPNOTSUPP;
}
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
- if (c->x86 == 15 && banks > 4) {
+ if (c->x86 == 15 && cfg->banks > 4) {
/*
* disable GART TBL walk error reporting, which
* trips off incorrectly with the IOMMU & 3ware
@@ -1320,19 +1519,56 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
*/
clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
}
- if (c->x86 <= 17 && mce_bootlog < 0) {
+ if (c->x86 <= 17 && cfg->bootlog < 0) {
/*
* Lots of broken BIOS around that don't clear them
* by default and leave crap in there. Don't log:
*/
- mce_bootlog = 0;
+ cfg->bootlog = 0;
}
/*
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
- if (c->x86 == 6 && banks > 0)
+ if (c->x86 == 6 && cfg->banks > 0)
mce_banks[0].ctl = 0;
+
+ /*
+ * Turn off MC4_MISC thresholding banks on those models since
+ * they're not supported there.
+ */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+ int i;
+ u64 val, hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
+ 0x00000413, /* MC4_MISC0 */
+ 0xc0000408, /* MC4_MISC1 */
+ };
+
+ rdmsrl(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+ for (i = 0; i < ARRAY_SIZE(msrs); i++) {
+ rdmsrl(msrs[i], val);
+
+ /* CntP bit set? */
+ if (val & BIT_64(62)) {
+ val &= ~BIT_64(62);
+ wrmsrl(msrs[i], val);
+ }
+ }
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+ }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1345,7 +1581,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* valid event later, merely don't write CTL0.
*/
- if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
+ if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
mce_banks[0].init = 0;
/*
@@ -1353,36 +1589,44 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* synchronization with a one second timeout.
*/
if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
- monarch_timeout < 0)
- monarch_timeout = USEC_PER_SEC;
+ cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = USEC_PER_SEC;
/*
* There are also broken BIOSes on some Pentium M and
* earlier systems:
*/
- if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
- mce_bootlog = 0;
+ if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
+ cfg->bootlog = 0;
+
+ if (c->x86 == 6 && c->x86_model == 45)
+ quirk_no_way_out = quirk_sandybridge_ifu;
}
- if (monarch_timeout < 0)
- monarch_timeout = 0;
- if (mce_bootlog != 0)
- mce_panic_timeout = 30;
+ if (cfg->monarch_timeout < 0)
+ cfg->monarch_timeout = 0;
+ if (cfg->bootlog != 0)
+ cfg->panic_timeout = 30;
return 0;
}
-static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
- return;
+ return 0;
+
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
+ return 1;
break;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
+ return 1;
break;
}
+
+ return 0;
}
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
@@ -1390,6 +1634,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
+ mce_adjust_timer = mce_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
@@ -1399,27 +1644,32 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
}
}
-static void __mcheck_cpu_init_timer(void)
+static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(mce_next_interval);
-
- setup_timer(t, mce_start_timer, smp_processor_id());
+ unsigned long iv = check_interval * HZ;
- if (mce_ignore_ce)
+ if (mca_cfg.ignore_ce || !iv)
return;
- *n = check_interval * HZ;
- if (!*n)
- return;
- t->expires = round_jiffies(jiffies + *n);
- add_timer_on(t, smp_processor_id());
+ per_cpu(mce_next_interval, cpu) = iv;
+
+ t->expires = round_jiffies(jiffies + iv);
+ add_timer_on(t, cpu);
+}
+
+static void __mcheck_cpu_init_timer(void)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned int cpu = smp_processor_id();
+
+ setup_timer(t, mce_timer_fn, cpu);
+ mce_start_timer(cpu, t);
}
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
- printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+ pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
smp_processor_id());
}
@@ -1431,18 +1681,19 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off:
*/
-void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
+void mcheck_cpu_init(struct cpuinfo_x86 *c)
{
- if (mce_disabled)
+ if (mca_cfg.disabled)
return;
- __mcheck_cpu_ancient_init(c);
+ if (__mcheck_cpu_ancient_init(c))
+ return;
if (!mce_available(c))
return;
if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
- mce_disabled = 1;
+ mca_cfg.disabled = true;
return;
}
@@ -1452,44 +1703,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_timer();
INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
-
+ init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
}
/*
- * Character device to read and clear the MCE log.
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
*/
-static DEFINE_SPINLOCK(mce_state_lock);
-static int open_count; /* #times opened */
-static int open_exclu; /* already open exclusive? */
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count; /* #times opened */
+static int mce_chrdev_open_exclu; /* already open exclusive? */
-static int mce_open(struct inode *inode, struct file *file)
+static int mce_chrdev_open(struct inode *inode, struct file *file)
{
- spin_lock(&mce_state_lock);
+ spin_lock(&mce_chrdev_state_lock);
- if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
- spin_unlock(&mce_state_lock);
+ if (mce_chrdev_open_exclu ||
+ (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+ spin_unlock(&mce_chrdev_state_lock);
return -EBUSY;
}
if (file->f_flags & O_EXCL)
- open_exclu = 1;
- open_count++;
+ mce_chrdev_open_exclu = 1;
+ mce_chrdev_open_count++;
- spin_unlock(&mce_state_lock);
+ spin_unlock(&mce_chrdev_state_lock);
return nonseekable_open(inode, file);
}
-static int mce_release(struct inode *inode, struct file *file)
+static int mce_chrdev_release(struct inode *inode, struct file *file)
{
- spin_lock(&mce_state_lock);
+ spin_lock(&mce_chrdev_state_lock);
- open_count--;
- open_exclu = 0;
+ mce_chrdev_open_count--;
+ mce_chrdev_open_exclu = 0;
- spin_unlock(&mce_state_lock);
+ spin_unlock(&mce_chrdev_state_lock);
return 0;
}
@@ -1517,6 +1769,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
/* Error or no more MCE record */
if (rc <= 0) {
mce_apei_read_done = 1;
+ /*
+ * When ERST is disabled, mce_chrdev_read() should return
+ * "no record" instead of "no device."
+ */
+ if (rc == -ENODEV)
+ return 0;
return rc;
}
rc = -EFAULT;
@@ -1538,8 +1796,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
return 0;
}
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
- loff_t *off)
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+ size_t usize, loff_t *off)
{
char __user *buf = ubuf;
unsigned long *cpu_tsc;
@@ -1550,7 +1808,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
if (!cpu_tsc)
return -ENOMEM;
- mutex_lock(&mce_read_mutex);
+ mutex_lock(&mce_chrdev_read_mutex);
if (!mce_apei_read_done) {
err = __mce_read_apei(&buf, usize);
@@ -1570,19 +1828,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
do {
for (i = prev; i < next; i++) {
unsigned long start = jiffies;
+ struct mce *m = &mcelog.entry[i];
- while (!mcelog.entry[i].finished) {
+ while (!m->finished) {
if (time_after_eq(jiffies, start + 2)) {
- memset(mcelog.entry + i, 0,
- sizeof(struct mce));
+ memset(m, 0, sizeof(*m));
goto timeout;
}
cpu_relax();
}
smp_rmb();
- err |= copy_to_user(buf, mcelog.entry + i,
- sizeof(struct mce));
- buf += sizeof(struct mce);
+ err |= copy_to_user(buf, m, sizeof(*m));
+ buf += sizeof(*m);
timeout:
;
}
@@ -1602,13 +1859,13 @@ timeout:
on_each_cpu(collect_tscs, cpu_tsc, 1);
for (i = next; i < MCE_LOG_LEN; i++) {
- if (mcelog.entry[i].finished &&
- mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
- err |= copy_to_user(buf, mcelog.entry+i,
- sizeof(struct mce));
+ struct mce *m = &mcelog.entry[i];
+
+ if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+ err |= copy_to_user(buf, m, sizeof(*m));
smp_rmb();
- buf += sizeof(struct mce);
- memset(&mcelog.entry[i], 0, sizeof(struct mce));
+ buf += sizeof(*m);
+ memset(m, 0, sizeof(*m));
}
}
@@ -1616,23 +1873,24 @@ timeout:
err = -EFAULT;
out:
- mutex_unlock(&mce_read_mutex);
+ mutex_unlock(&mce_chrdev_read_mutex);
kfree(cpu_tsc);
return err ? err : buf - ubuf;
}
-static unsigned int mce_poll(struct file *file, poll_table *wait)
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
{
- poll_wait(file, &mce_wait, wait);
- if (rcu_dereference_check_mce(mcelog.next))
+ poll_wait(file, &mce_chrdev_wait, wait);
+ if (rcu_access_index(mcelog.next))
return POLLIN | POLLRDNORM;
if (!mce_apei_read_done && apei_check_mce())
return POLLIN | POLLRDNORM;
return 0;
}
-static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+ unsigned long arg)
{
int __user *p = (int __user *)arg;
@@ -1658,23 +1916,61 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
}
}
-/* Modified in mce-inject.c, so not static or const */
-struct file_operations mce_chrdev_ops = {
- .open = mce_open,
- .release = mce_release,
- .read = mce_read,
- .poll = mce_poll,
- .unlocked_ioctl = mce_ioctl,
- .llseek = no_llseek,
+static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off);
+
+void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
+ const char __user *ubuf,
+ size_t usize, loff_t *off))
+{
+ mce_write = fn;
+}
+EXPORT_SYMBOL_GPL(register_mce_write_callback);
+
+ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ if (mce_write)
+ return mce_write(filp, ubuf, usize, off);
+ else
+ return -EINVAL;
+}
+
+static const struct file_operations mce_chrdev_ops = {
+ .open = mce_chrdev_open,
+ .release = mce_chrdev_release,
+ .read = mce_chrdev_read,
+ .write = mce_chrdev_write,
+ .poll = mce_chrdev_poll,
+ .unlocked_ioctl = mce_chrdev_ioctl,
+ .llseek = no_llseek,
};
-EXPORT_SYMBOL_GPL(mce_chrdev_ops);
-static struct miscdevice mce_log_device = {
+static struct miscdevice mce_chrdev_device = {
MISC_MCELOG_MINOR,
"mcelog",
&mce_chrdev_ops,
};
+static void __mce_disable_bank(void *arg)
+{
+ int bank = *((int *)arg);
+ __clear_bit(bank, __get_cpu_var(mce_poll_banks));
+ cmci_disable_bank(bank);
+}
+
+void mce_disable_bank(int bank)
+{
+ if (bank >= mca_cfg.banks) {
+ pr_warn(FW_BUG
+ "Ignoring request to disable invalid MCA bank %d.\n",
+ bank);
+ return;
+ }
+ set_bit(bank, mce_banks_ce_disabled);
+ on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
/*
* mce=off Disables machine check
* mce=no_cmci Disables CMCI
@@ -1685,9 +1981,12 @@ static struct miscdevice mce_log_device = {
* check, or 0 to not wait
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
*/
static int __init mcheck_enable(char *str)
{
+ struct mca_config *cfg = &mca_cfg;
+
if (*str == 0) {
enable_p5_mce();
return 1;
@@ -1695,24 +1994,25 @@ static int __init mcheck_enable(char *str)
if (*str == '=')
str++;
if (!strcmp(str, "off"))
- mce_disabled = 1;
+ cfg->disabled = true;
else if (!strcmp(str, "no_cmci"))
- mce_cmci_disabled = 1;
+ cfg->cmci_disabled = true;
else if (!strcmp(str, "dont_log_ce"))
- mce_dont_log_ce = 1;
+ cfg->dont_log_ce = true;
else if (!strcmp(str, "ignore_ce"))
- mce_ignore_ce = 1;
+ cfg->ignore_ce = true;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
- mce_bootlog = (str[0] == 'b');
+ cfg->bootlog = (str[0] == 'b');
+ else if (!strcmp(str, "bios_cmci_threshold"))
+ cfg->bios_cmci_threshold = true;
else if (isdigit(str[0])) {
- get_option(&str, &tolerant);
+ get_option(&str, &(cfg->tolerant));
if (*str == ',') {
++str;
- get_option(&str, &monarch_timeout);
+ get_option(&str, &(cfg->monarch_timeout));
}
} else {
- printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
- str);
+ pr_info("mce argument %s ignored. Please use /sys\n", str);
return 0;
}
return 1;
@@ -1721,15 +2021,13 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
- atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
-
mcheck_intel_therm_init();
return 0;
}
/*
- * Sysfs support
+ * mce_syscore: PM support
*/
/*
@@ -1740,7 +2038,7 @@ static int mce_disable_error_reporting(void)
{
int i;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
@@ -1749,14 +2047,14 @@ static int mce_disable_error_reporting(void)
return 0;
}
-static int mce_suspend(struct sys_device *dev, pm_message_t state)
+static int mce_syscore_suspend(void)
{
return mce_disable_error_reporting();
}
-static int mce_shutdown(struct sys_device *dev)
+static void mce_syscore_shutdown(void)
{
- return mce_disable_error_reporting();
+ mce_disable_error_reporting();
}
/*
@@ -1764,17 +2062,24 @@ static int mce_shutdown(struct sys_device *dev)
* Only one CPU is active at this time, the others get re-added later using
* CPU hotplug:
*/
-static int mce_resume(struct sys_device *dev)
+static void mce_syscore_resume(void)
{
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
-
- return 0;
}
+static struct syscore_ops mce_syscore_ops = {
+ .suspend = mce_syscore_suspend,
+ .shutdown = mce_syscore_shutdown,
+ .resume = mce_syscore_resume,
+};
+
+/*
+ * mce_device: Sysfs support
+ */
+
static void mce_cpu_restart(void *data)
{
- del_timer_sync(&__get_cpu_var(mce_timer));
if (!mce_available(__this_cpu_ptr(&cpu_info)))
return;
__mcheck_cpu_init_generic();
@@ -1784,16 +2089,15 @@ static void mce_cpu_restart(void *data)
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
+ mce_timer_delete_all();
on_each_cpu(mce_cpu_restart, NULL, 1);
}
/* Toggle features for corrected errors */
-static void mce_disable_ce(void *all)
+static void mce_disable_cmci(void *data)
{
if (!mce_available(__this_cpu_ptr(&cpu_info)))
return;
- if (all)
- del_timer_sync(&__get_cpu_var(mce_timer));
cmci_clear();
}
@@ -1807,30 +2111,27 @@ static void mce_enable_ce(void *all)
__mcheck_cpu_init_timer();
}
-static struct sysdev_class mce_sysclass = {
- .suspend = mce_suspend,
- .shutdown = mce_shutdown,
- .resume = mce_resume,
+static struct bus_type mce_subsys = {
.name = "machinecheck",
+ .dev_name = "machinecheck",
};
-DEFINE_PER_CPU(struct sys_device, mce_dev);
+DEFINE_PER_CPU(struct device *, mce_device);
-__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
+static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
return container_of(attr, struct mce_bank, attr);
}
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1845,14 +2146,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
}
static ssize_t
-show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
strcpy(buf, mce_helper);
strcat(buf, "\n");
return strlen(mce_helper) + 1;
}
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
const char *buf, size_t siz)
{
char *p;
@@ -1867,8 +2168,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
return strlen(mce_helper) + !!p;
}
-static ssize_t set_ignore_ce(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t set_ignore_ce(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1876,22 +2177,23 @@ static ssize_t set_ignore_ce(struct sys_device *s,
if (strict_strtoull(buf, 0, &new) < 0)
return -EINVAL;
- if (mce_ignore_ce ^ !!new) {
+ if (mca_cfg.ignore_ce ^ !!new) {
if (new) {
/* disable ce features */
- on_each_cpu(mce_disable_ce, (void *)1, 1);
- mce_ignore_ce = 1;
+ mce_timer_delete_all();
+ on_each_cpu(mce_disable_cmci, NULL, 1);
+ mca_cfg.ignore_ce = true;
} else {
/* enable ce features */
- mce_ignore_ce = 0;
+ mca_cfg.ignore_ce = false;
on_each_cpu(mce_enable_ce, (void *)1, 1);
}
}
return size;
}
-static ssize_t set_cmci_disabled(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t set_cmci_disabled(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1899,125 +2201,137 @@ static ssize_t set_cmci_disabled(struct sys_device *s,
if (strict_strtoull(buf, 0, &new) < 0)
return -EINVAL;
- if (mce_cmci_disabled ^ !!new) {
+ if (mca_cfg.cmci_disabled ^ !!new) {
if (new) {
/* disable cmci */
- on_each_cpu(mce_disable_ce, NULL, 1);
- mce_cmci_disabled = 1;
+ on_each_cpu(mce_disable_cmci, NULL, 1);
+ mca_cfg.cmci_disabled = true;
} else {
/* enable cmci */
- mce_cmci_disabled = 0;
+ mca_cfg.cmci_disabled = false;
on_each_cpu(mce_enable_ce, NULL, 1);
}
}
return size;
}
-static ssize_t store_int_with_restart(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t store_int_with_restart(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
- ssize_t ret = sysdev_store_int(s, attr, buf, size);
+ ssize_t ret = device_store_int(s, attr, buf, size);
mce_restart();
return ret;
}
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
+static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
-static struct sysdev_ext_attribute attr_check_interval = {
- _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
- store_int_with_restart),
+static struct dev_ext_attribute dev_attr_check_interval = {
+ __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
&check_interval
};
-static struct sysdev_ext_attribute attr_ignore_ce = {
- _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
- &mce_ignore_ce
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+ __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
+ &mca_cfg.ignore_ce
};
-static struct sysdev_ext_attribute attr_cmci_disabled = {
- _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
- &mce_cmci_disabled
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+ __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
+ &mca_cfg.cmci_disabled
};
-static struct sysdev_attribute *mce_attrs[] = {
- &attr_tolerant.attr,
- &attr_check_interval.attr,
- &attr_trigger,
- &attr_monarch_timeout.attr,
- &attr_dont_log_ce.attr,
- &attr_ignore_ce.attr,
- &attr_cmci_disabled.attr,
+static struct device_attribute *mce_device_attrs[] = {
+ &dev_attr_tolerant.attr,
+ &dev_attr_check_interval.attr,
+ &dev_attr_trigger,
+ &dev_attr_monarch_timeout.attr,
+ &dev_attr_dont_log_ce.attr,
+ &dev_attr_ignore_ce.attr,
+ &dev_attr_cmci_disabled.attr,
NULL
};
-static cpumask_var_t mce_dev_initialized;
+static cpumask_var_t mce_device_initialized;
+
+static void mce_device_release(struct device *dev)
+{
+ kfree(dev);
+}
-/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_create_device(unsigned int cpu)
+/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+static int mce_device_create(unsigned int cpu)
{
+ struct device *dev;
int err;
int i, j;
if (!mce_available(&boot_cpu_data))
return -EIO;
- memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
- per_cpu(mce_dev, cpu).id = cpu;
- per_cpu(mce_dev, cpu).cls = &mce_sysclass;
+ dev = kzalloc(sizeof *dev, GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ dev->id = cpu;
+ dev->bus = &mce_subsys;
+ dev->release = &mce_device_release;
- err = sysdev_register(&per_cpu(mce_dev, cpu));
- if (err)
+ err = device_register(dev);
+ if (err) {
+ put_device(dev);
return err;
+ }
- for (i = 0; mce_attrs[i]; i++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ for (i = 0; mce_device_attrs[i]; i++) {
+ err = device_create_file(dev, mce_device_attrs[i]);
if (err)
goto error;
}
- for (j = 0; j < banks; j++) {
- err = sysdev_create_file(&per_cpu(mce_dev, cpu),
- &mce_banks[j].attr);
+ for (j = 0; j < mca_cfg.banks; j++) {
+ err = device_create_file(dev, &mce_banks[j].attr);
if (err)
goto error2;
}
- cpumask_set_cpu(cpu, mce_dev_initialized);
+ cpumask_set_cpu(cpu, mce_device_initialized);
+ per_cpu(mce_device, cpu) = dev;
return 0;
error2:
while (--j >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
+ device_remove_file(dev, &mce_banks[j].attr);
error:
while (--i >= 0)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ device_remove_file(dev, mce_device_attrs[i]);
- sysdev_unregister(&per_cpu(mce_dev, cpu));
+ device_unregister(dev);
return err;
}
-static __cpuinit void mce_remove_device(unsigned int cpu)
+static void mce_device_remove(unsigned int cpu)
{
+ struct device *dev = per_cpu(mce_device, cpu);
int i;
- if (!cpumask_test_cpu(cpu, mce_dev_initialized))
+ if (!cpumask_test_cpu(cpu, mce_device_initialized))
return;
- for (i = 0; mce_attrs[i]; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+ for (i = 0; mce_device_attrs[i]; i++)
+ device_remove_file(dev, mce_device_attrs[i]);
- for (i = 0; i < banks; i++)
- sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
+ for (i = 0; i < mca_cfg.banks; i++)
+ device_remove_file(dev, &mce_banks[i].attr);
- sysdev_unregister(&per_cpu(mce_dev, cpu));
- cpumask_clear_cpu(cpu, mce_dev_initialized);
+ device_unregister(dev);
+ cpumask_clear_cpu(cpu, mce_device_initialized);
+ per_cpu(mce_device, cpu) = NULL;
}
/* Make sure there are no machine checks on offlined CPUs. */
-static void __cpuinit mce_disable_cpu(void *h)
+static void mce_disable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
int i;
@@ -2027,7 +2341,7 @@ static void __cpuinit mce_disable_cpu(void *h)
if (!(action & CPU_TASKS_FROZEN))
cmci_clear();
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
@@ -2035,7 +2349,7 @@ static void __cpuinit mce_disable_cpu(void *h)
}
}
-static void __cpuinit mce_reenable_cpu(void *h)
+static void mce_reenable_cpu(void *h)
{
unsigned long action = *(unsigned long *)h;
int i;
@@ -2045,7 +2359,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
if (!(action & CPU_TASKS_FROZEN))
cmci_reenable();
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
if (b->init)
@@ -2054,48 +2368,43 @@ static void __cpuinit mce_reenable_cpu(void *h)
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int __cpuinit
+static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
struct timer_list *t = &per_cpu(mce_timer, cpu);
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- mce_create_device(cpu);
+ mce_device_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
case CPU_DEAD:
- case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
- mce_remove_device(cpu);
+ mce_device_remove(cpu);
+ mce_intel_hcpu_update(cpu);
break;
case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- del_timer_sync(t);
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ del_timer_sync(t);
break;
case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!mce_ignore_ce && check_interval) {
- t->expires = round_jiffies(jiffies +
- __get_cpu_var(mce_next_interval));
- add_timer_on(t, cpu);
- }
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ mce_start_timer(cpu, t);
break;
- case CPU_POST_DEAD:
+ }
+
+ if (action == CPU_POST_DEAD) {
/* intentionally ignoring frozen here */
- cmci_rediscover(cpu);
- break;
+ cmci_rediscover();
}
+
return NOTIFY_OK;
}
-static struct notifier_block mce_cpu_notifier __cpuinitdata = {
+static struct notifier_block mce_cpu_notifier = {
.notifier_call = mce_cpu_callback,
};
@@ -2103,9 +2412,9 @@ static __init void mce_init_banks(void)
{
int i;
- for (i = 0; i < banks; i++) {
+ for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
- struct sysdev_attribute *a = &b->attr;
+ struct device_attribute *a = &b->attr;
sysfs_attr_init(&a->attr);
a->attr.name = b->attrname;
@@ -2122,37 +2431,78 @@ static __init int mcheck_init_device(void)
int err;
int i = 0;
- if (!mce_available(&boot_cpu_data))
- return -EIO;
+ if (!mce_available(&boot_cpu_data)) {
+ err = -EIO;
+ goto err_out;
+ }
- zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
+ if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
mce_init_banks();
- err = sysdev_class_register(&mce_sysclass);
+ err = subsys_system_register(&mce_subsys, NULL);
if (err)
- return err;
+ goto err_out_mem;
+ cpu_notifier_register_begin();
for_each_online_cpu(i) {
- err = mce_create_device(i);
- if (err)
- return err;
+ err = mce_device_create(i);
+ if (err) {
+ /*
+ * Register notifier anyway (and do not unreg it) so
+ * that we don't leave undeleted timers, see notifier
+ * callback above.
+ */
+ __register_hotcpu_notifier(&mce_cpu_notifier);
+ cpu_notifier_register_done();
+ goto err_device_create;
+ }
}
- register_hotcpu_notifier(&mce_cpu_notifier);
- misc_register(&mce_log_device);
+ __register_hotcpu_notifier(&mce_cpu_notifier);
+ cpu_notifier_register_done();
+
+ register_syscore_ops(&mce_syscore_ops);
+
+ /* register character device /dev/mcelog */
+ err = misc_register(&mce_chrdev_device);
+ if (err)
+ goto err_register;
+
+ return 0;
+
+err_register:
+ unregister_syscore_ops(&mce_syscore_ops);
+
+err_device_create:
+ /*
+ * We didn't keep track of which devices were created above, but
+ * even if we had, the set of online cpus might have changed.
+ * Play safe and remove for every possible cpu, since
+ * mce_device_remove() will do the right thing.
+ */
+ for_each_possible_cpu(i)
+ mce_device_remove(i);
+
+err_out_mem:
+ free_cpumask_var(mce_device_initialized);
+
+err_out:
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
return err;
}
-
-device_initcall(mcheck_init_device);
+device_initcall_sync(mcheck_init_device);
/*
* Old style boot options parsing. Only for compatibility.
*/
static int __init mcheck_disable(char *str)
{
- mce_disabled = 1;
+ mca_cfg.disabled = true;
return 1;
}
__setup("nomce", mcheck_disable);