diff options
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck')
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-apei.c | 57 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-inject.c | 65 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-internal.h | 21 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce-severity.c | 196 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 1310 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_amd.c | 518 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_intel.c | 274 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/p5.c | 4 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/therm_throt.c | 296 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/threshold.c | 24 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/winchip.c | 4 |
11 files changed, 1783 insertions, 986 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 8209472b27a..a1aef953315 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -28,26 +28,33 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/export.h> #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/cper.h> #include <acpi/apei.h> +#include <acpi/ghes.h> #include <asm/mce.h> #include "mce-internal.h" -void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) +void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { struct mce m; - /* Only corrected MC is reported */ - if (!corrected) + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; mce_setup(&m); m.bank = 1; - /* Fake a memory read corrected error with unknown channel */ + /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + + if (severity >= GHES_SEV_RECOVERABLE) + m.status |= MCI_STATUS_UC; + if (severity >= GHES_SEV_PANIC) + m.status |= MCI_STATUS_PCC; + m.addr = mem_err->physical_addr; mce_log(&m); mce_notify_irq(); @@ -106,24 +113,34 @@ int apei_write_mce(struct mce *m) ssize_t apei_read_mce(struct mce *m, u64 *record_id) { struct cper_mce_record rcd; - ssize_t len; - - len = erst_read_next(&rcd.hdr, sizeof(rcd)); - if (len <= 0) - return len; - /* Can not skip other records in storage via ERST unless clear them */ - else if (len != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { - if (printk_ratelimit()) - pr_warning( - "MCE-APEI: Can not skip the unknown record in ERST"); - return -EIO; - } - + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; memcpy(m, &rcd.mce, sizeof(*m)); - *record_id = rcd.hdr.record_id; + rc = sizeof(*m); +out: + erst_get_record_id_end(); - return sizeof(*m); + return rc; } /* Check whether there is record in ERST */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index e7dbde7bfed..5ac2d1fb28b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/fs.h> +#include <linux/preempt.h> #include <linux/smp.h> #include <linux/notifier.h> #include <linux/kdebug.h> @@ -25,13 +26,14 @@ #include <linux/gfp.h> #include <asm/mce.h> #include <asm/apic.h> +#include <asm/nmi.h> /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) { struct mce *i = &per_cpu(injectm, m->extcpu); - /* Make sure noone reads partially written injectm */ + /* Make sure no one reads partially written injectm */ i->finished = 0; mb(); m->finished = 0; @@ -76,27 +78,33 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) } static cpumask_var_t mce_inject_cpumask; +static DEFINE_MUTEX(mce_inject_mutex); -static int mce_raise_notify(struct notifier_block *self, - unsigned long val, void *data) +static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) { - struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) - return NOTIFY_DONE; + if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) + return NMI_DONE; cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) - raise_exception(m, args->regs); + raise_exception(m, regs); else if (m->status) raise_poll(m); - return NOTIFY_STOP; + return NMI_HANDLED; } -static struct notifier_block mce_raise_nb = { - .notifier_call = mce_raise_notify, - .priority = 1000, -}; +static void mce_irq_ipi(void *info) +{ + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + + if (cpumask_test_cpu(cpu, mce_inject_cpumask) && + m->inject_flags & MCJ_EXCEPTION) { + cpumask_clear_cpu(cpu, mce_inject_cpumask); + raise_exception(m, NULL); + } +} /* Inject mce on current CPU */ static int raise_local(void) @@ -145,9 +153,10 @@ static void raise_mce(struct mce *m) return; #ifdef CONFIG_X86_LOCAL_APIC - if (m->inject_flags & MCJ_NMI_BROADCAST) { + if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; + get_online_cpus(); cpumask_copy(mce_inject_cpumask, cpu_online_mask); cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); @@ -157,13 +166,25 @@ static void raise_mce(struct mce *m) MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpumask_empty(mce_inject_cpumask)) - apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); + if (!cpumask_empty(mce_inject_cpumask)) { + if (m->inject_flags & MCJ_IRQ_BROADCAST) { + /* + * don't wait because mce_irq_ipi is necessary + * to be sync with following raise_local + */ + preempt_disable(); + smp_call_function_many(mce_inject_cpumask, + mce_irq_ipi, NULL, 0); + preempt_enable(); + } else if (m->inject_flags & MCJ_NMI_BROADCAST) + apic->send_IPI_mask(mce_inject_cpumask, + NMI_VECTOR); + } start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR - "Timeout waiting for mce inject NMI %lx\n", + "Timeout waiting for mce inject %lx\n", *cpumask_bits(mce_inject_cpumask)); break; } @@ -174,7 +195,11 @@ static void raise_mce(struct mce *m) put_online_cpus(); } else #endif + { + preempt_disable(); raise_local(); + preempt_enable(); + } } /* Error injection interface */ @@ -205,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, * so do it a jiffie or two later everywhere. */ schedule_timeout(2); + + mutex_lock(&mce_inject_mutex); raise_mce(&m); + mutex_unlock(&mce_inject_mutex); return usize; } @@ -214,8 +242,9 @@ static int inject_init(void) if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); - mce_chrdev_ops.write = mce_write; - register_die_notifier(&mce_raise_nb); + register_mce_write_callback(mce_write); + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, + "mce_notify"); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fefcc69ee8b..09edd0b65fe 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,4 +1,4 @@ -#include <linux/sysdev.h> +#include <linux/device.h> #include <asm/mce.h> enum severity_level { @@ -17,16 +17,29 @@ enum severity_level { struct mce_bank { u64 ctl; /* subevents to enable */ unsigned char init; /* initialise bank? */ - struct sysdev_attribute attr; /* sysdev attribute */ + struct device_attribute attr; /* device attribute */ char attrname[ATTR_LEN]; /* attribute name */ }; int mce_severity(struct mce *a, int tolerant, char **msg); struct dentry *mce_get_debugfs_dir(void); -extern int mce_ser; - extern struct mce_bank *mce_banks; +extern mce_banks_t mce_banks_ce_disabled; + +#ifdef CONFIG_X86_MCE_INTEL +unsigned long mce_intel_adjust_timer(unsigned long interval); +void mce_intel_cmci_poll(void); +void mce_intel_hcpu_update(unsigned long cpu); +void cmci_disable_bank(int bank); +#else +# define mce_intel_adjust_timer mce_adjust_timer_default +static inline void mce_intel_cmci_poll(void) { } +static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void cmci_disable_bank(int bank) { } +#endif + +void mce_timer_kick(unsigned long interval); #ifdef CONFIG_ACPI_APEI int apei_write_mce(struct mce *m); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 1e8d66c1336..c370e1c4468 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -43,88 +43,154 @@ static struct severity { unsigned char covered; char *msg; } severities[] = { -#define KERNEL .context = IN_KERNEL -#define USER .context = IN_USER -#define SER .ser = SER_REQUIRED -#define NOSER .ser = NO_SER -#define SEV(s) .sev = MCE_ ## s ## _SEVERITY -#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } -#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } -#define MCGMASK(x, res, s, m, r...) \ - { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } -#define MASK(x, y, s, m, r...) \ - { .mask = x, .result = y, SEV(s), .msg = m, ## r } +#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER +#define BITCLR(x) .mask = x, .result = 0 +#define BITSET(x) .mask = x, .result = x +#define MCGMASK(x, y) .mcgmask = x, .mcgres = y +#define MASK(x, y) .mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) -#define MCACOD 0xffff +#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) - BITCLR(MCI_STATUS_VAL, NO, "Invalid"), - BITCLR(MCI_STATUS_EN, NO, "Not enabled"), - BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), + MCESEV( + NO, "Invalid", + BITCLR(MCI_STATUS_VAL) + ), + MCESEV( + NO, "Not enabled", + BITCLR(MCI_STATUS_EN) + ), + MCESEV( + PANIC, "Processor context corrupt", + BITSET(MCI_STATUS_PCC) + ), /* When MCIP is not set something is very confused */ - MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), + MCESEV( + PANIC, "MCIP not set in MCA handler", + MCGMASK(MCG_STATUS_MCIP, 0) + ), /* Neither return not error IP -- no chance to recover -> PANIC */ - MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, - "Neither restart nor error IP"), - MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", - KERNEL), - BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), - MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, - "Spurious not enabled", SER), + MCESEV( + PANIC, "Neither restart nor error IP", + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + ), + MCESEV( + PANIC, "In kernel and no restart IP", + KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + KEEP, "Corrected error", + NOSER, BITCLR(MCI_STATUS_UC) + ), /* ignore OVER for UCNA */ - MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, - "Uncorrected no action required", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, - "Illegal combination (UCNA with AR=1)", SER), - MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), - - /* AR add known MCACODs here */ - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, - "Action required with lost events", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, - "Action required; unknown MCACOD", SER), + MCESEV( + KEEP, "Uncorrected no action required", + SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) + ), + MCESEV( + PANIC, "Illegal combination (UCNA with AR=1)", + SER, + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) + ), + MCESEV( + KEEP, "Non signalled machine check", + SER, BITCLR(MCI_STATUS_S) + ), + + MCESEV( + PANIC, "Action required with lost events", + SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) + ), + + /* known AR MCACODs: */ +#ifdef CONFIG_MEMORY_FAILURE + MCESEV( + KEEP, "Action required but unaffected thread is continuable", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) + ), + MCESEV( + AR, "Action required: data load error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + USER + ), + MCESEV( + AR, "Action required: instruction fetch error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + USER + ), +#endif + MCESEV( + PANIC, "Action required: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) + ), /* known AO MCACODs: */ - MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, - "Action optional: memory scrubbing error", SER), - MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, - "Action optional: last level cache writeback error", SER), - - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, - "Action optional unknown MCACOD", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, - "Action optional with lost events", SER), - BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), - BITSET(MCI_STATUS_UC, UC, "Uncorrected"), - BITSET(0, SOME, "No match") /* always matches. keep at end */ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) + ), + MCESEV( + SOME, "Action optional: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) + ), + MCESEV( + SOME, "Action optional with lost events", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S) + ), + + MCESEV( + PANIC, "Overflowed uncorrected", + BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) + ), + MCESEV( + UC, "Uncorrected", + BITSET(MCI_STATUS_UC) + ), + MCESEV( + SOME, "No match", + BITSET(0) + ) /* always matches. keep at end */ }; /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. + * If mcgstatus indicated that ip/cs on the stack were + * no good, then "m->cs" will be zero and we will have + * to assume the worst case (IN_KERNEL) as we actually + * have no idea what we were executing when the machine + * check hit. + * If we do have a good "m->cs" (or a faked one in the + * case we were executing in VM86 mode) we can use it to + * distinguish an exception taken in user from from one + * taken in the kernel. */ static int error_context(struct mce *m) { - if (m->mcgstatus & MCG_STATUS_EIPV) - return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; - /* Unknown, assume kernel */ - return IN_KERNEL; + return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } -int mce_severity(struct mce *a, int tolerant, char **msg) +int mce_severity(struct mce *m, int tolerant, char **msg) { - enum context ctx = error_context(a); + enum context ctx = error_context(m); struct severity *s; for (s = severities;; s++) { - if ((a->status & s->mask) != s->result) + if ((m->status & s->mask) != s->result) continue; - if ((a->mcgstatus & s->mcgmask) != s->mcgres) + if ((m->mcgstatus & s->mcgmask) != s->mcgres) continue; - if (s->ser == SER_REQUIRED && !mce_ser) + if (s->ser == SER_REQUIRED && !mca_cfg.ser) continue; - if (s->ser == NO_SER && mce_ser) + if (s->ser == NO_SER && mca_cfg.ser) continue; if (s->context && ctx != s->context) continue; @@ -197,15 +263,15 @@ static const struct file_operations severities_coverage_fops = { static int __init severities_debugfs_init(void) { - struct dentry *dmce = NULL, *fseverities_coverage = NULL; + struct dentry *dmce, *fsev; dmce = mce_get_debugfs_dir(); - if (dmce == NULL) + if (!dmce) goto err_out; - fseverities_coverage = debugfs_create_file("severities-coverage", - 0444, dmce, NULL, - &severities_coverage_fops); - if (fseverities_coverage == NULL) + + fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL, + &severities_coverage_fops); + if (!fsev) goto err_out; return 0; @@ -214,4 +280,4 @@ err_out: return -ENOMEM; } late_initcall(severities_debugfs_init); -#endif +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7a35b72d7c0..9a79c8dbd8e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -7,10 +7,12 @@ * Copyright 2008 Intel Corporation * Author: Andi Kleen */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/thread_info.h> #include <linux/capability.h> #include <linux/miscdevice.h> -#include <linux/interrupt.h> #include <linux/ratelimit.h> #include <linux/kallsyms.h> #include <linux/rcupdate.h> @@ -20,7 +22,8 @@ #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/string.h> -#include <linux/sysdev.h> +#include <linux/device.h> +#include <linux/syscore_ops.h> #include <linux/delay.h> #include <linux/ctype.h> #include <linux/sched.h> @@ -36,95 +39,84 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/debugfs.h> -#include <linux/edac_mce.h> +#include <linux/irq_work.h> +#include <linux/export.h> #include <asm/processor.h> -#include <asm/hw_irq.h> -#include <asm/apic.h> -#include <asm/idle.h> -#include <asm/ipi.h> #include <asm/mce.h> #include <asm/msr.h> #include "mce-internal.h" -static DEFINE_MUTEX(mce_read_mutex); +static DEFINE_MUTEX(mce_chrdev_read_mutex); #define rcu_dereference_check_mce(p) \ rcu_dereference_index_check((p), \ rcu_read_lock_sched_held() || \ - lockdep_is_held(&mce_read_mutex)) + lockdep_is_held(&mce_chrdev_read_mutex)) #define CREATE_TRACE_POINTS #include <trace/events/mce.h> -int mce_disabled __read_mostly; - -#define MISC_MCELOG_MINOR 227 - #define SPINUNIT 100 /* 100ns */ -atomic_t mce_entry; - DEFINE_PER_CPU(unsigned, mce_exception_count); -/* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ -static int tolerant __read_mostly = 1; -static int banks __read_mostly; -static int rip_msr __read_mostly; -static int mce_bootlog __read_mostly = -1; -static int monarch_timeout __read_mostly = -1; -static int mce_panic_timeout __read_mostly; -static int mce_dont_log_ce __read_mostly; -int mce_cmci_disabled __read_mostly; -int mce_ignore_ce __read_mostly; -int mce_ser __read_mostly; - -struct mce_bank *mce_banks __read_mostly; +struct mce_bank *mce_banks __read_mostly; + +struct mca_config mca_cfg __read_mostly = { + .bootlog = -1, + /* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ + .tolerant = 1, + .monarch_timeout = -1 +}; /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +/* CMCI storm detection filter */ +static DEFINE_PER_CPU(unsigned long, mce_polled_error); + /* - * CPU/chipset specific EDAC code can register a notifier call here to print - * MCE errors in a human-readable form. + * MCA banks polled by the period polling timer for corrected events. + * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); -EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); - -static int default_decode_mce(struct notifier_block *nb, unsigned long val, - void *data) -{ - pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); - pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); - - return NOTIFY_STOP; -} - -static struct notifier_block mce_dec_nb = { - .notifier_call = default_decode_mce, - .priority = -1, -}; - -/* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; +/* + * MCA banks controlled through firmware first for corrected errors. + * This is a global list of banks for which we won't enable CMCI and we + * won't poll. Firmware controls these banks and is responsible for + * reporting corrected errors through GHES. Uncorrected/recoverable + * errors are still notified through a machine check. + */ +mce_banks_t mce_banks_ce_disabled; + static DEFINE_PER_CPU(struct work_struct, mce_work); +static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); + +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -135,9 +127,7 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); -#ifdef CONFIG_SMP m->socketid = cpu_data(m->extcpu).phys_proc_id; -#endif m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } @@ -160,23 +150,20 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + int ret = 0; /* Emit the trace record: */ trace_mce_record(mce); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + if (ret == NOTIFY_STOP) + return; + mce->finished = 0; wmb(); for (;;) { entry = rcu_dereference_check_mce(mcelog.next); for (;;) { - /* - * If edac_mce is enabled, it will check the error type - * and will process it, if it is a known error. - * Otherwise, the error will be sent through mcelog - * interface - */ - if (edac_mce_parse(mce)) - return; /* * When the buffer fills up discard new entries. @@ -209,8 +196,61 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +static void drain_mcelog_buffer(void) +{ + unsigned int next, i, prev = 0; + + next = ACCESS_ONCE(mcelog.next); + + do { + struct mce *m; + + /* drain what was logged during boot */ + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + unsigned retries = 1; + + m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2*retries)) + retries++; + + cpu_relax(); + + if (!m->finished && retries >= 4) { + pr_err("skipping error being logged currently!\n"); + break; + } + } + smp_rmb(); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + } + + memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); +} + + +void mce_register_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + drain_mcelog_buffer(); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); + static void print_mce(struct mce *m) { + int ret = 0; + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); @@ -231,14 +271,23 @@ static void print_mce(struct mce *m) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + /* + * Note this output is parsed by external tools and old fields + * should not be changed. + */ + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, + cpu_data(m->extcpu).microcode); /* * Print out human-readable details about the MCE error, * (if the CPU has an implementation for that) */ - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + if (ret == NOTIFY_STOP) + return; + + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -258,7 +307,7 @@ static void wait_for_panic(void) while (timeout-- > 0) udelay(1); if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; + panic_timeout = mca_cfg.panic_timeout; panic("Panicing machine check CPU died"); } @@ -316,7 +365,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; + panic_timeout = mca_cfg.panic_timeout; panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); @@ -326,9 +375,9 @@ static void mce_panic(char *msg, struct mce *final, char *exp) static int msr_to_offset(u32 msr) { - unsigned bank = __get_cpu_var(injectm.bank); + unsigned bank = __this_cpu_read(injectm.bank); - if (msr == rip_msr) + if (msr == mca_cfg.rip_msr) return offsetof(struct mce, ip); if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); @@ -346,7 +395,7 @@ static u64 mce_rdmsrl(u32 msr) { u64 v; - if (__get_cpu_var(injectm).finished) { + if (__this_cpu_read(injectm.finished)) { int offset = msr_to_offset(msr); if (offset < 0) @@ -369,7 +418,7 @@ static u64 mce_rdmsrl(u32 msr) static void mce_wrmsrl(u32 msr, u64 v) { - if (__get_cpu_var(injectm).finished) { + if (__this_cpu_read(injectm.finished)) { int offset = msr_to_offset(msr); if (offset >= 0) @@ -380,6 +429,39 @@ static void mce_wrmsrl(u32 msr, u64 v) } /* + * Collect all global (w.r.t. this processor) status about this machine + * check into our "mce" struct so that we can use it later to assess + * the severity of the problem as we read per-bank specific details. + */ +static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) +{ + mce_setup(m); + + m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (regs) { + /* + * Get the address of the instruction at the time of + * the machine check error. + */ + if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { + m->ip = regs->ip; + m->cs = regs->cs; + + /* + * When in VM86 mode make the cs look like ring 3 + * always. This is a lie, but it's better than passing + * the additional vm86 bit around everywhere. + */ + if (v8086_mode(regs)) + m->cs |= 3; + } + /* Use accurate RIP reporting if available. */ + if (mca_cfg.rip_msr) + m->ip = mce_rdmsrl(mca_cfg.rip_msr); + } +} + +/* * Simple lockless ring to communicate PFNs from the exception handler with the * process context work function. This is vastly simplified because there's * only a single reader and a single writer. @@ -436,54 +518,24 @@ static int mce_ring_add(unsigned long pfn) int mce_available(struct cpuinfo_x86 *c) { - if (mce_disabled) + if (mca_cfg.disabled) return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } static void mce_schedule_work(void) { - if (!mce_ring_empty()) { - struct work_struct *work = &__get_cpu_var(mce_work); - if (!work_pending(work)) - schedule_work(work); - } + if (!mce_ring_empty()) + schedule_work(&__get_cpu_var(mce_work)); } -/* - * Get the address of the instruction at the time of the machine check - * error. - */ -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ - - if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { - m->ip = regs->ip; - m->cs = regs->cs; - } else { - m->ip = 0; - m->cs = 0; - } - if (rip_msr) - m->ip = mce_rdmsrl(rip_msr); -} +DEFINE_PER_CPU(struct irq_work, mce_irq_work); -#ifdef CONFIG_X86_LOCAL_APIC -/* - * Called after interrupts have been reenabled again - * when a MCE happened during an interrupts off region - * in the kernel. - */ -asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) +static void mce_irq_work_cb(struct irq_work *entry) { - ack_APIC_irq(); - exit_idle(); - irq_enter(); mce_notify_irq(); mce_schedule_work(); - irq_exit(); } -#endif static void mce_report_event(struct pt_regs *regs) { @@ -499,29 +551,28 @@ static void mce_report_event(struct pt_regs *regs) return; } -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Without APIC do not notify. The event will be picked - * up eventually. - */ - if (!cpu_has_apic) - return; + irq_work_queue(&__get_cpu_var(mce_irq_work)); +} - /* - * When interrupts are disabled we cannot use - * kernel services safely. Trigger an self interrupt - * through the APIC to instead do the notification - * after interrupts are reenabled again. - */ - apic->send_IPI_self(MCE_SELF_VECTOR); +/* + * Read ADDR and MISC registers. + */ +static void mce_read_aux(struct mce *m, int i) +{ + if (m->status & MCI_STATUS_MISCV) + m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); + if (m->status & MCI_STATUS_ADDRV) { + m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); - /* - * Wait for idle afterwards again so that we don't leave the - * APIC in a non idle state because the normal APIC writes - * cannot exclude us. - */ - apic_wait_icr_idle(); -#endif + /* + * Mask the reported address by the reported granularity. + */ + if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) { + u8 shift = MCI_MISC_ADDR_LSB(m->misc); + m->addr >>= shift; + m->addr <<= shift; + } + } } DEFINE_PER_CPU(unsigned, mce_poll_count); @@ -546,12 +597,11 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; - percpu_inc(mce_poll_count); + this_cpu_inc(mce_poll_count); - mce_setup(&m); + mce_gather_info(&m, NULL); - m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -565,6 +615,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(m.status & MCI_STATUS_VAL)) continue; + this_cpu_write(mce_polled_error, 1); /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -572,13 +623,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * TBD do the same check for MCI_STATUS_EN here? */ if (!(flags & MCP_UC) && - (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) + (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -586,11 +634,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { + if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) mce_log(&m); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); - add_taint(TAINT_MACHINE_CHECK); - } /* * Clear state for this bank. @@ -611,16 +656,22 @@ EXPORT_SYMBOL_GPL(machine_check_poll); * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static int mce_no_way_out(struct mce *m, char **msg) +static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, + struct pt_regs *regs) { - int i; + int i, ret = 0; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); - if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) - return 1; + if (m->status & MCI_STATUS_VAL) { + __set_bit(i, validp); + if (quirk_no_way_out) + quirk_no_way_out(i, m, regs); + } + if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) + ret = 1; } - return 0; + return ret; } /* @@ -648,11 +699,10 @@ static int mce_timed_out(u64 *t) rmb(); if (atomic_read(&mce_paniced)) wait_for_panic(); - if (!monarch_timeout) + if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - /* CHECKME: Make panic default for 1 too? */ - if (tolerant < 1) + if (mca_cfg.tolerant <= 1) mce_panic("Timeout synchronizing machine check over CPUs", NULL, NULL); cpu_missing = 1; @@ -702,7 +752,8 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, + int severity = mce_severity(&per_cpu(mces_seen, cpu), + mca_cfg.tolerant, &nmsg); if (severity > global_worst) { msg = nmsg; @@ -716,7 +767,7 @@ static void mce_reign(void) * This dumps all the mces in the log buffer and stops the * other CPUs. */ - if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) + if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) mce_panic("Fatal Machine check", m, msg); /* @@ -729,7 +780,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -753,7 +804,7 @@ static int mce_start(int *no_way_out) { int order; int cpus = num_online_cpus(); - u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; if (!timeout) return -1; @@ -817,7 +868,7 @@ static int mce_start(int *no_way_out) static int mce_end(int order) { int ret = -1; - u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; if (!timeout) goto reset; @@ -881,15 +932,15 @@ reset: * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses upto page granuality for now. + * parser). So only support physical addresses up to page granuality for now. */ static int mce_usable_address(struct mce *m) { if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) return 0; - if ((m->misc & 0x3f) > PAGE_SHIFT) + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) return 0; - if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) return 0; return 1; } @@ -898,13 +949,58 @@ static void mce_clear_state(unsigned long *toclear) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { if (test_bit(i, toclear)) mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } /* + * Need to save faulting physical address associated with a process + * in the machine check handler some place where we can grab it back + * later in mce_notify_process() + */ +#define MCE_INFO_MAX 16 + +struct mce_info { + atomic_t inuse; + struct task_struct *t; + __u64 paddr; + int restartable; +} mce_info[MCE_INFO_MAX]; + +static void mce_save_info(__u64 addr, int c) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { + if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { + mi->t = current; + mi->paddr = addr; + mi->restartable = c; + return; + } + } + + mce_panic("Too many concurrent recoverable errors", NULL, NULL); +} + +static struct mce_info *mce_find_info(void) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) + if (atomic_read(&mi->inuse) && mi->t == current) + return mi; + return NULL; +} + +static void mce_clear_info(struct mce_info *mi) +{ + atomic_set(&mi->inuse, 0); +} + +/* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. * @@ -918,6 +1014,7 @@ static void mce_clear_state(unsigned long *toclear) */ void do_machine_check(struct pt_regs *regs, long error_code) { + struct mca_config *cfg = &mca_cfg; struct mce m, *final; int i; int worst = 0; @@ -929,7 +1026,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) int order; /* * If no_way_out gets set, there is no safe way to recover from this - * MCE. If tolerant is cranked up, we'll try anyway. + * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ int no_way_out = 0; /* @@ -938,30 +1035,28 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ int kill_it = 0; DECLARE_BITMAP(toclear, MAX_NR_BANKS); + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - atomic_inc(&mce_entry); + this_cpu_inc(mce_exception_count); - percpu_inc(mce_exception_count); - - if (notify_die(DIE_NMI, "machine check", regs, error_code, - 18, SIGKILL) == NOTIFY_STOP) - goto out; - if (!banks) + if (!cfg->banks) goto out; - mce_setup(&m); + mce_gather_info(&m, regs); - m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); final = &__get_cpu_var(mces_seen); *final = m; - no_way_out = mce_no_way_out(&m, &msg); + memset(valid_banks, 0, sizeof(valid_banks)); + no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); barrier(); /* - * When no restart IP must always kill or panic. + * When no restart IP might need to kill or panic. + * Assume the worst for now, but if we find the + * severity is MCE_AR_SEVERITY we have other options. */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; @@ -972,8 +1067,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) * because the first one to see it will clear it. */ order = mce_start(&no_way_out); - for (i = 0; i < banks; i++) { + for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; if (!mce_banks[i].ctl) continue; @@ -989,16 +1086,16 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Non uncorrected or non signaled errors are handled by * machine_check_poll. Leave them alone, unless this panics. */ - if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && !no_way_out) continue; /* * Set taint even when machine check was not enabled. */ - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(&m, tolerant, NULL); + severity = mce_severity(&m, cfg->tolerant, NULL); /* * When machine check was for corrected handler don't touch, @@ -1015,28 +1112,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) continue; } - /* - * Kill on action required. - */ - if (severity == MCE_AR_SEVERITY) - kill_it = 1; - - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); /* * Action optional error. Queue address for later processing. * When the ring overflows we just ignore the AO error. * RED-PEN add some logging mechanism when * usable_address or mce_add_ring fails. - * RED-PEN don't ignore overflow for tolerant == 0 + * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 */ if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) mce_ring_add(m.addr >> PAGE_SHIFT); - mce_get_rip(&m, regs); mce_log(&m); if (severity > worst) { @@ -1045,6 +1132,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) } } + /* mce_clear_state will clear *final, save locally for use later */ + m = *final; + if (!no_way_out) mce_clear_state(toclear); @@ -1056,65 +1146,91 @@ void do_machine_check(struct pt_regs *regs, long error_code) no_way_out = worst >= MCE_PANIC_SEVERITY; /* - * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. - * - * This is mainly used in the case when the system doesn't - * support MCE broadcasting or it has been disabled. + * At insane "tolerant" levels we take no action. Otherwise + * we only die if we have no other choice. For less serious + * issues we try to recover, or limit damage to the current + * process. */ - if (no_way_out && tolerant < 3) - mce_panic("Fatal machine check on current CPU", final, msg); - - /* - * If the error seems to be unrecoverable, something should be - * done. Try to kill as little as possible. If we can kill just - * one task, do that. If the user has set the tolerance very - * high, don't try to do anything at all. - */ - - if (kill_it && tolerant < 3) - force_sig(SIGBUS, current); - - /* notify userspace ASAP */ - set_thread_flag(TIF_MCE_NOTIFY); + if (cfg->tolerant < 3) { + if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); + if (worst == MCE_AR_SEVERITY) { + /* schedule action before return to userland */ + mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); + set_thread_flag(TIF_MCE_NOTIFY); + } else if (kill_it) { + force_sig(SIGBUS, current); + } + } if (worst > 0) mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out: - atomic_dec(&mce_entry); sync_core(); } EXPORT_SYMBOL_GPL(do_machine_check); -/* dummy to break dependency. actual code is in mm/memory-failure.c */ -void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +#ifndef CONFIG_MEMORY_FAILURE +int memory_failure(unsigned long pfn, int vector, int flags) { - printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); + /* mce_severity() should not hand us an ACTION_REQUIRED error */ + BUG_ON(flags & MF_ACTION_REQUIRED); + pr_err("Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", + pfn); + + return 0; } +#endif /* - * Called after mce notification in process context. This code - * is allowed to sleep. Call the high level VM handler to process - * any corrupted pages. - * Assume that the work queue code only calls this one at a time - * per CPU. - * Note we don't disable preemption, so this code might run on the wrong - * CPU. In this case the event is picked up by the scheduled work queue. - * This is merely a fast path to expedite processing in some common - * cases. + * Called in process context that interrupted by MCE and marked with + * TIF_MCE_NOTIFY, just before returning to erroneous userland. + * This code is allowed to sleep. + * Attempt possible recovery such as calling the high level VM handler to + * process any corrupted pages, and kill/signal current process if required. + * Action required errors are handled here. */ void mce_notify_process(void) { unsigned long pfn; - mce_notify_irq(); - while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR); + struct mce_info *mi = mce_find_info(); + int flags = MF_ACTION_REQUIRED; + + if (!mi) + mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); + pfn = mi->paddr >> PAGE_SHIFT; + + clear_thread_flag(TIF_MCE_NOTIFY); + + pr_err("Uncorrected hardware memory error in user-access at %llx", + mi->paddr); + /* + * We must call memory_failure() here even if the current process is + * doomed. We still need to mark the page as poisoned and alert any + * other users of the page. + */ + if (!mi->restartable) + flags |= MF_MUST_KILL; + if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { + pr_err("Memory error not recovered"); + force_sig(SIGBUS, current); + } + mce_clear_info(mi); } +/* + * Action optional processing happens here (picking up + * from the list of faulting pages that do_machine_check() + * placed into the "ring"). + */ static void mce_process_work(struct work_struct *dummy) { - mce_notify_process(); + unsigned long pfn; + + while (mce_ring_get(&pfn)) + memory_failure(pfn, MCE_VECTOR, 0); } #ifdef CONFIG_X86_MCE_INTEL @@ -1147,35 +1263,88 @@ void mce_log_therm_throt_event(__u64 status) * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). */ -static int check_interval = 5 * 60; /* 5 minutes */ +static unsigned long check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mce_start_timer(unsigned long data) +static unsigned long mce_adjust_timer_default(unsigned long interval) { - struct timer_list *t = &per_cpu(mce_timer, data); - int *n; + return interval; +} + +static unsigned long (*mce_adjust_timer)(unsigned long interval) = + mce_adjust_timer_default; + +static int cmc_error_seen(void) +{ + unsigned long *v = &__get_cpu_var(mce_polled_error); + + return test_and_clear_bit(0, v); +} + +static void mce_timer_fn(unsigned long data) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long iv; + int notify; WARN_ON(smp_processor_id() != data); - if (mce_available(¤t_cpu_data)) { + if (mce_available(__this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); + mce_intel_cmci_poll(); } /* * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(mce_next_interval); - if (mce_notify_irq()) - *n = max(*n/2, HZ/100); - else - *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); + iv = __this_cpu_read(mce_next_interval); + notify = mce_notify_irq(); + notify |= cmc_error_seen(); + if (notify) { + iv = max(iv / 2, (unsigned long) HZ/100); + } else { + iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + iv = mce_adjust_timer(iv); + } + __this_cpu_write(mce_next_interval, iv); + /* Might have become 0 after CMCI storm subsided */ + if (iv) { + t->expires = jiffies + iv; + add_timer_on(t, smp_processor_id()); + } +} + +/* + * Ensure that the timer is firing in @interval from now. + */ +void mce_timer_kick(unsigned long interval) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long when = jiffies + interval; + unsigned long iv = __this_cpu_read(mce_next_interval); - t->expires = jiffies + *n; - add_timer_on(t, smp_processor_id()); + if (timer_pending(t)) { + if (time_before(when, t->expires)) + mod_timer_pinned(t, when); + } else { + t->expires = round_jiffies(when); + add_timer_on(t, smp_processor_id()); + } + if (interval < iv) + __this_cpu_write(mce_next_interval, interval); +} + +/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +static void mce_timer_delete_all(void) +{ + int cpu; + + for_each_online_cpu(cpu) + del_timer_sync(&per_cpu(mce_timer, cpu)); } static void mce_do_trigger(struct work_struct *work) @@ -1195,17 +1364,11 @@ int mce_notify_irq(void) /* Not more than two messages every minute */ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - clear_thread_flag(TIF_MCE_NOTIFY); - if (test_and_clear_bit(0, &mce_need_notify)) { - wake_up_interruptible(&mce_wait); + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); - /* - * There is no risk of missing notifications because - * work_pending is always cleared before the function is - * executed. - */ - if (mce_helper[0] && !work_pending(&mce_trigger_work)) + if (mce_helper[0]) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) @@ -1217,14 +1380,16 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); -static int __cpuinit __mcheck_cpu_mce_banks_init(void) +static int __mcheck_cpu_mce_banks_init(void) { int i; + u8 num_banks = mca_cfg.banks; - mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL); if (!mce_banks) return -ENOMEM; - for (i = 0; i < banks; i++) { + + for (i = 0; i < num_banks; i++) { struct mce_bank *b = &mce_banks[i]; b->ctl = -1ULL; @@ -1236,7 +1401,7 @@ static int __cpuinit __mcheck_cpu_mce_banks_init(void) /* * Initialize Machine Checks for a CPU. */ -static int __cpuinit __mcheck_cpu_cap_init(void) +static int __mcheck_cpu_cap_init(void) { unsigned b; u64 cap; @@ -1244,19 +1409,19 @@ static int __cpuinit __mcheck_cpu_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - if (!banks) - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + if (!mca_cfg.banks) + pr_info("CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { - printk(KERN_WARNING - "MCE: Using only %u machine check banks out of %u\n", + pr_warn("Using only %u machine check banks out of %u\n", MAX_NR_BANKS, b); b = MAX_NR_BANKS; } /* Don't support asymmetric configurations today */ - WARN_ON(banks != 0 && b != banks); - banks = b; + WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); + mca_cfg.banks = b; + if (!mce_banks) { int err = __mcheck_cpu_mce_banks_init(); @@ -1266,25 +1431,29 @@ static int __cpuinit __mcheck_cpu_cap_init(void) /* Use accurate RIP reporting if available. */ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - rip_msr = MSR_IA32_MCG_EIP; + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; if (cap & MCG_SER_P) - mce_ser = 1; + mca_cfg.ser = true; return 0; } static void __mcheck_cpu_init_generic(void) { + enum mcp_flags m_fl = 0; mce_banks_t all_banks; u64 cap; int i; + if (!mca_cfg.bootlog) + m_fl = MCP_DONTLOG; + /* * Log the machine checks left over from the previous reset. */ bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); + machine_check_poll(MCP_UC | m_fl, &all_banks); set_in_cr4(X86_CR4_MCE); @@ -1292,7 +1461,7 @@ static void __mcheck_cpu_init_generic(void) if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (!b->init) @@ -1302,17 +1471,47 @@ static void __mcheck_cpu_init_generic(void) } } +/* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. + */ +static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + /* Add per CPU specific workarounds here */ -static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { + struct mca_config *cfg = &mca_cfg; + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); + pr_info("unknown CPU type - not enabling MCE support\n"); return -EOPNOTSUPP; } /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) { + if (c->x86 == 15 && cfg->banks > 4) { /* * disable GART TBL walk error reporting, which * trips off incorrectly with the IOMMU & 3ware @@ -1320,19 +1519,56 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } - if (c->x86 <= 17 && mce_bootlog < 0) { + if (c->x86 <= 17 && cfg->bootlog < 0) { /* * Lots of broken BIOS around that don't clear them * by default and leave crap in there. Don't log: */ - mce_bootlog = 0; + cfg->bootlog = 0; } /* * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6 && banks > 0) + if (c->x86 == 6 && cfg->banks > 0) mce_banks[0].ctl = 0; + + /* + * Turn off MC4_MISC thresholding banks on those models since + * they're not supported there. + */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + int i; + u64 val, hwcr; + bool need_toggle; + u32 msrs[] = { + 0x00000413, /* MC4_MISC0 */ + 0xc0000408, /* MC4_MISC1 */ + }; + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + for (i = 0; i < ARRAY_SIZE(msrs); i++) { + rdmsrl(msrs[i], val); + + /* CntP bit set? */ + if (val & BIT_64(62)) { + val &= ~BIT_64(62); + wrmsrl(msrs[i], val); + } + } + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); + } } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1345,7 +1581,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) mce_banks[0].init = 0; /* @@ -1353,36 +1589,44 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * synchronization with a one second timeout. */ if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - monarch_timeout < 0) - monarch_timeout = USEC_PER_SEC; + cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; /* * There are also broken BIOSes on some Pentium M and * earlier systems: */ - if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) - mce_bootlog = 0; + if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) + cfg->bootlog = 0; + + if (c->x86 == 6 && c->x86_model == 45) + quirk_no_way_out = quirk_sandybridge_ifu; } - if (monarch_timeout < 0) - monarch_timeout = 0; - if (mce_bootlog != 0) - mce_panic_timeout = 30; + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = 0; + if (cfg->bootlog != 0) + cfg->panic_timeout = 30; return 0; } -static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) - return; + return 0; + switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); + return 1; break; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); + return 1; break; } + + return 0; } static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) @@ -1390,6 +1634,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); + mce_adjust_timer = mce_intel_adjust_timer; break; case X86_VENDOR_AMD: mce_amd_feature_init(c); @@ -1399,27 +1644,32 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } -static void __mcheck_cpu_init_timer(void) +static void mce_start_timer(unsigned int cpu, struct timer_list *t) { - struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(mce_next_interval); - - setup_timer(t, mce_start_timer, smp_processor_id()); + unsigned long iv = check_interval * HZ; - if (mce_ignore_ce) + if (mca_cfg.ignore_ce || !iv) return; - *n = check_interval * HZ; - if (!*n) - return; - t->expires = round_jiffies(jiffies + *n); - add_timer_on(t, smp_processor_id()); + per_cpu(mce_next_interval, cpu) = iv; + + t->expires = round_jiffies(jiffies + iv); + add_timer_on(t, cpu); +} + +static void __mcheck_cpu_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned int cpu = smp_processor_id(); + + setup_timer(t, mce_timer_fn, cpu); + mce_start_timer(cpu, t); } /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); } @@ -1431,18 +1681,19 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = * Called for each booted CPU to set up machine checks. * Must be called with preempt off: */ -void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) +void mcheck_cpu_init(struct cpuinfo_x86 *c) { - if (mce_disabled) + if (mca_cfg.disabled) return; - __mcheck_cpu_ancient_init(c); + if (__mcheck_cpu_ancient_init(c)) + return; if (!mce_available(c)) return; if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { - mce_disabled = 1; + mca_cfg.disabled = true; return; } @@ -1452,44 +1703,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); - + init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); } /* - * Character device to read and clear the MCE log. + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. */ -static DEFINE_SPINLOCK(mce_state_lock); -static int open_count; /* #times opened */ -static int open_exclu; /* already open exclusive? */ +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? */ -static int mce_open(struct inode *inode, struct file *file) +static int mce_chrdev_open(struct inode *inode, struct file *file) { - spin_lock(&mce_state_lock); + spin_lock(&mce_chrdev_state_lock); - if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_state_lock); + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); return -EBUSY; } if (file->f_flags & O_EXCL) - open_exclu = 1; - open_count++; + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; - spin_unlock(&mce_state_lock); + spin_unlock(&mce_chrdev_state_lock); return nonseekable_open(inode, file); } -static int mce_release(struct inode *inode, struct file *file) +static int mce_chrdev_release(struct inode *inode, struct file *file) { - spin_lock(&mce_state_lock); + spin_lock(&mce_chrdev_state_lock); - open_count--; - open_exclu = 0; + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; - spin_unlock(&mce_state_lock); + spin_unlock(&mce_chrdev_state_lock); return 0; } @@ -1517,6 +1769,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize) /* Error or no more MCE record */ if (rc <= 0) { mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; return rc; } rc = -EFAULT; @@ -1538,8 +1796,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize) return 0; } -static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, - loff_t *off) +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) { char __user *buf = ubuf; unsigned long *cpu_tsc; @@ -1550,7 +1808,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, if (!cpu_tsc) return -ENOMEM; - mutex_lock(&mce_read_mutex); + mutex_lock(&mce_chrdev_read_mutex); if (!mce_apei_read_done) { err = __mce_read_apei(&buf, usize); @@ -1570,19 +1828,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, do { for (i = prev; i < next; i++) { unsigned long start = jiffies; + struct mce *m = &mcelog.entry[i]; - while (!mcelog.entry[i].finished) { + while (!m->finished) { if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i, 0, - sizeof(struct mce)); + memset(m, 0, sizeof(*m)); goto timeout; } cpu_relax(); } smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, - sizeof(struct mce)); - buf += sizeof(struct mce); + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); timeout: ; } @@ -1602,13 +1859,13 @@ timeout: on_each_cpu(collect_tscs, cpu_tsc, 1); for (i = next; i < MCE_LOG_LEN; i++) { - if (mcelog.entry[i].finished && - mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { - err |= copy_to_user(buf, mcelog.entry+i, - sizeof(struct mce)); + struct mce *m = &mcelog.entry[i]; + + if (m->finished && m->tsc < cpu_tsc[m->cpu]) { + err |= copy_to_user(buf, m, sizeof(*m)); smp_rmb(); - buf += sizeof(struct mce); - memset(&mcelog.entry[i], 0, sizeof(struct mce)); + buf += sizeof(*m); + memset(m, 0, sizeof(*m)); } } @@ -1616,23 +1873,24 @@ timeout: err = -EFAULT; out: - mutex_unlock(&mce_read_mutex); + mutex_unlock(&mce_chrdev_read_mutex); kfree(cpu_tsc); return err ? err : buf - ubuf; } -static unsigned int mce_poll(struct file *file, poll_table *wait) +static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) { - poll_wait(file, &mce_wait, wait); - if (rcu_dereference_check_mce(mcelog.next)) + poll_wait(file, &mce_chrdev_wait, wait); + if (rcu_access_index(mcelog.next)) return POLLIN | POLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return POLLIN | POLLRDNORM; return 0; } -static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) { int __user *p = (int __user *)arg; @@ -1658,23 +1916,61 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) } } -/* Modified in mce-inject.c, so not static or const */ -struct file_operations mce_chrdev_ops = { - .open = mce_open, - .release = mce_release, - .read = mce_read, - .poll = mce_poll, - .unlocked_ioctl = mce_ioctl, - .llseek = no_llseek, +static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off); + +void register_mce_write_callback(ssize_t (*fn)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)) +{ + mce_write = fn; +} +EXPORT_SYMBOL_GPL(register_mce_write_callback); + +ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + if (mce_write) + return mce_write(filp, ubuf, usize, off); + else + return -EINVAL; +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .write = mce_chrdev_write, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .llseek = no_llseek, }; -EXPORT_SYMBOL_GPL(mce_chrdev_ops); -static struct miscdevice mce_log_device = { +static struct miscdevice mce_chrdev_device = { MISC_MCELOG_MINOR, "mcelog", &mce_chrdev_ops, }; +static void __mce_disable_bank(void *arg) +{ + int bank = *((int *)arg); + __clear_bit(bank, __get_cpu_var(mce_poll_banks)); + cmci_disable_bank(bank); +} + +void mce_disable_bank(int bank) +{ + if (bank >= mca_cfg.banks) { + pr_warn(FW_BUG + "Ignoring request to disable invalid MCA bank %d.\n", + bank); + return; + } + set_bit(bank, mce_banks_ce_disabled); + on_each_cpu(__mce_disable_bank, &bank, 1); +} + /* * mce=off Disables machine check * mce=no_cmci Disables CMCI @@ -1685,9 +1981,12 @@ static struct miscdevice mce_log_device = { * check, or 0 to not wait * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. + * mce=bios_cmci_threshold Don't program the CMCI threshold */ static int __init mcheck_enable(char *str) { + struct mca_config *cfg = &mca_cfg; + if (*str == 0) { enable_p5_mce(); return 1; @@ -1695,24 +1994,25 @@ static int __init mcheck_enable(char *str) if (*str == '=') str++; if (!strcmp(str, "off")) - mce_disabled = 1; + cfg->disabled = true; else if (!strcmp(str, "no_cmci")) - mce_cmci_disabled = 1; + cfg->cmci_disabled = true; else if (!strcmp(str, "dont_log_ce")) - mce_dont_log_ce = 1; + cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) - mce_ignore_ce = 1; + cfg->ignore_ce = true; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) - mce_bootlog = (str[0] == 'b'); + cfg->bootlog = (str[0] == 'b'); + else if (!strcmp(str, "bios_cmci_threshold")) + cfg->bios_cmci_threshold = true; else if (isdigit(str[0])) { - get_option(&str, &tolerant); + get_option(&str, &(cfg->tolerant)); if (*str == ',') { ++str; - get_option(&str, &monarch_timeout); + get_option(&str, &(cfg->monarch_timeout)); } } else { - printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", - str); + pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; } return 1; @@ -1721,15 +2021,13 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); - mcheck_intel_therm_init(); return 0; } /* - * Sysfs support + * mce_syscore: PM support */ /* @@ -1740,7 +2038,7 @@ static int mce_disable_error_reporting(void) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -1749,14 +2047,14 @@ static int mce_disable_error_reporting(void) return 0; } -static int mce_suspend(struct sys_device *dev, pm_message_t state) +static int mce_syscore_suspend(void) { return mce_disable_error_reporting(); } -static int mce_shutdown(struct sys_device *dev) +static void mce_syscore_shutdown(void) { - return mce_disable_error_reporting(); + mce_disable_error_reporting(); } /* @@ -1764,18 +2062,25 @@ static int mce_shutdown(struct sys_device *dev) * Only one CPU is active at this time, the others get re-added later using * CPU hotplug: */ -static int mce_resume(struct sys_device *dev) +static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); - __mcheck_cpu_init_vendor(¤t_cpu_data); - - return 0; + __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); } +static struct syscore_ops mce_syscore_ops = { + .suspend = mce_syscore_suspend, + .shutdown = mce_syscore_shutdown, + .resume = mce_syscore_resume, +}; + +/* + * mce_device: Sysfs support + */ + static void mce_cpu_restart(void *data) { - del_timer_sync(&__get_cpu_var(mce_timer)); - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); __mcheck_cpu_init_timer(); @@ -1784,22 +2089,21 @@ static void mce_cpu_restart(void *data) /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { + mce_timer_delete_all(); on_each_cpu(mce_cpu_restart, NULL, 1); } /* Toggle features for corrected errors */ -static void mce_disable_ce(void *all) +static void mce_disable_cmci(void *data) { - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; - if (all) - del_timer_sync(&__get_cpu_var(mce_timer)); cmci_clear(); } static void mce_enable_ce(void *all) { - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; cmci_reenable(); cmci_recheck(); @@ -1807,30 +2111,27 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, +static struct bus_type mce_subsys = { .name = "machinecheck", + .dev_name = "machinecheck", }; -DEFINE_PER_CPU(struct sys_device, mce_dev); +DEFINE_PER_CPU(struct device *, mce_device); -__cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) { return container_of(attr, struct mce_bank, attr); } -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t show_bank(struct device *s, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_bank(struct device *s, struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1845,14 +2146,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t -show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +show_trigger(struct device *s, struct device_attribute *attr, char *buf) { strcpy(buf, mce_helper); strcat(buf, "\n"); return strlen(mce_helper) + 1; } -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, const char *buf, size_t siz) { char *p; @@ -1867,8 +2168,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return strlen(mce_helper) + !!p; } -static ssize_t set_ignore_ce(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_ignore_ce(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1876,22 +2177,23 @@ static ssize_t set_ignore_ce(struct sys_device *s, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - if (mce_ignore_ce ^ !!new) { + if (mca_cfg.ignore_ce ^ !!new) { if (new) { /* disable ce features */ - on_each_cpu(mce_disable_ce, (void *)1, 1); - mce_ignore_ce = 1; + mce_timer_delete_all(); + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.ignore_ce = true; } else { /* enable ce features */ - mce_ignore_ce = 0; + mca_cfg.ignore_ce = false; on_each_cpu(mce_enable_ce, (void *)1, 1); } } return size; } -static ssize_t set_cmci_disabled(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_cmci_disabled(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1899,135 +2201,147 @@ static ssize_t set_cmci_disabled(struct sys_device *s, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - if (mce_cmci_disabled ^ !!new) { + if (mca_cfg.cmci_disabled ^ !!new) { if (new) { /* disable cmci */ - on_each_cpu(mce_disable_ce, NULL, 1); - mce_cmci_disabled = 1; + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.cmci_disabled = true; } else { /* enable cmci */ - mce_cmci_disabled = 0; + mca_cfg.cmci_disabled = false; on_each_cpu(mce_enable_ce, NULL, 1); } } return size; } -static ssize_t store_int_with_restart(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t store_int_with_restart(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { - ssize_t ret = sysdev_store_int(s, attr, buf, size); + ssize_t ret = device_store_int(s, attr, buf, size); mce_restart(); return ret; } -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); +static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); +static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); -static struct sysdev_ext_attribute attr_check_interval = { - _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, - store_int_with_restart), +static struct dev_ext_attribute dev_attr_check_interval = { + __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), &check_interval }; -static struct sysdev_ext_attribute attr_ignore_ce = { - _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), - &mce_ignore_ce +static struct dev_ext_attribute dev_attr_ignore_ce = { + __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce), + &mca_cfg.ignore_ce }; -static struct sysdev_ext_attribute attr_cmci_disabled = { - _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), - &mce_cmci_disabled +static struct dev_ext_attribute dev_attr_cmci_disabled = { + __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled), + &mca_cfg.cmci_disabled }; -static struct sysdev_attribute *mce_attrs[] = { - &attr_tolerant.attr, - &attr_check_interval.attr, - &attr_trigger, - &attr_monarch_timeout.attr, - &attr_dont_log_ce.attr, - &attr_ignore_ce.attr, - &attr_cmci_disabled.attr, +static struct device_attribute *mce_device_attrs[] = { + &dev_attr_tolerant.attr, + &dev_attr_check_interval.attr, + &dev_attr_trigger, + &dev_attr_monarch_timeout.attr, + &dev_attr_dont_log_ce.attr, + &dev_attr_ignore_ce.attr, + &dev_attr_cmci_disabled.attr, NULL }; -static cpumask_var_t mce_dev_initialized; +static cpumask_var_t mce_device_initialized; + +static void mce_device_release(struct device *dev) +{ + kfree(dev); +} -/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_create_device(unsigned int cpu) +/* Per cpu device init. All of the cpus still share the same ctrl bank: */ +static int mce_device_create(unsigned int cpu) { + struct device *dev; int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(mce_dev, cpu).id = cpu; - per_cpu(mce_dev, cpu).cls = &mce_sysclass; + dev = kzalloc(sizeof *dev, GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->id = cpu; + dev->bus = &mce_subsys; + dev->release = &mce_device_release; - err = sysdev_register(&per_cpu(mce_dev, cpu)); - if (err) + err = device_register(dev); + if (err) { + put_device(dev); return err; + } - for (i = 0; mce_attrs[i]; i++) { - err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) { + err = device_create_file(dev, mce_device_attrs[i]); if (err) goto error; } - for (j = 0; j < banks; j++) { - err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &mce_banks[j].attr); + for (j = 0; j < mca_cfg.banks; j++) { + err = device_create_file(dev, &mce_banks[j].attr); if (err) goto error2; } - cpumask_set_cpu(cpu, mce_dev_initialized); + cpumask_set_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = dev; return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); + device_remove_file(dev, &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + device_remove_file(dev, mce_device_attrs[i]); - sysdev_unregister(&per_cpu(mce_dev, cpu)); + device_unregister(dev); return err; } -static __cpuinit void mce_remove_device(unsigned int cpu) +static void mce_device_remove(unsigned int cpu) { + struct device *dev = per_cpu(mce_device, cpu); int i; - if (!cpumask_test_cpu(cpu, mce_dev_initialized)) + if (!cpumask_test_cpu(cpu, mce_device_initialized)) return; - for (i = 0; mce_attrs[i]; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) + device_remove_file(dev, mce_device_attrs[i]); - for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); + for (i = 0; i < mca_cfg.banks; i++) + device_remove_file(dev, &mce_banks[i].attr); - sysdev_unregister(&per_cpu(mce_dev, cpu)); - cpumask_clear_cpu(cpu, mce_dev_initialized); + device_unregister(dev); + cpumask_clear_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = NULL; } /* Make sure there are no machine checks on offlined CPUs. */ -static void __cpuinit mce_disable_cpu(void *h) +static void mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2035,17 +2349,17 @@ static void __cpuinit mce_disable_cpu(void *h) } } -static void __cpuinit mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2054,48 +2368,43 @@ static void __cpuinit mce_reenable_cpu(void *h) } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int __cpuinit +static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - mce_create_device(cpu); + mce_device_create(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; case CPU_DEAD: - case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); - mce_remove_device(cpu); + mce_device_remove(cpu); + mce_intel_hcpu_update(cpu); break; case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - del_timer_sync(t); smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + del_timer_sync(t); break; case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - if (!mce_ignore_ce && check_interval) { - t->expires = round_jiffies(jiffies + - __get_cpu_var(mce_next_interval)); - add_timer_on(t, cpu); - } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + mce_start_timer(cpu, t); break; - case CPU_POST_DEAD: + } + + if (action == CPU_POST_DEAD) { /* intentionally ignoring frozen here */ - cmci_rediscover(cpu); - break; + cmci_rediscover(); } + return NOTIFY_OK; } -static struct notifier_block mce_cpu_notifier __cpuinitdata = { +static struct notifier_block mce_cpu_notifier = { .notifier_call = mce_cpu_callback, }; @@ -2103,9 +2412,9 @@ static __init void mce_init_banks(void) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; - struct sysdev_attribute *a = &b->attr; + struct device_attribute *a = &b->attr; sysfs_attr_init(&a->attr); a->attr.name = b->attrname; @@ -2122,37 +2431,78 @@ static __init int mcheck_init_device(void) int err; int i = 0; - if (!mce_available(&boot_cpu_data)) - return -EIO; + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } - zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } mce_init_banks(); - err = sysdev_class_register(&mce_sysclass); + err = subsys_system_register(&mce_subsys, NULL); if (err) - return err; + goto err_out_mem; + cpu_notifier_register_begin(); for_each_online_cpu(i) { - err = mce_create_device(i); - if (err) - return err; + err = mce_device_create(i); + if (err) { + /* + * Register notifier anyway (and do not unreg it) so + * that we don't leave undeleted timers, see notifier + * callback above. + */ + __register_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + goto err_device_create; + } } - register_hotcpu_notifier(&mce_cpu_notifier); - misc_register(&mce_log_device); + __register_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + + register_syscore_ops(&mce_syscore_ops); + + /* register character device /dev/mcelog */ + err = misc_register(&mce_chrdev_device); + if (err) + goto err_register; + + return 0; + +err_register: + unregister_syscore_ops(&mce_syscore_ops); + +err_device_create: + /* + * We didn't keep track of which devices were created above, but + * even if we had, the set of online cpus might have changed. + * Play safe and remove for every possible cpu, since + * mce_device_remove() will do the right thing. + */ + for_each_possible_cpu(i) + mce_device_remove(i); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); return err; } - -device_initcall(mcheck_init_device); +device_initcall_sync(mcheck_init_device); /* * Old style boot options parsing. Only for compatibility. */ static int __init mcheck_disable(char *str) { - mce_disabled = 1; + mca_cfg.disabled = true; return 1; } __setup("nomce", mcheck_disable); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 80c482382d5..603df4f7464 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,15 +1,17 @@ /* - * (c) 2005, 2006 Advanced Micro Devices, Inc. + * (c) 2005-2012 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. * - * Support : jacob.shin@amd.com + * Maintained by: Borislav Petkov <bp@alien8.de> * * April 2006 * - added support for AMD Family 0x10 processors + * May 2012 + * - major scrubbing * * All MC4_MISCi registers are shared between multi-cores */ @@ -17,7 +19,6 @@ #include <linux/notifier.h> #include <linux/kobject.h> #include <linux/percpu.h> -#include <linux/sysdev.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/sysfs.h> @@ -26,14 +27,12 @@ #include <linux/cpu.h> #include <linux/smp.h> +#include <asm/amd_nb.h> #include <asm/apic.h> #include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> -#define PFX "mce_threshold: " -#define VERSION "version 1.1.1" -#define NR_BANKS 6 #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 @@ -48,36 +47,16 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 -struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; +static const char * const th_names[] = { + "load_store", + "insn_fetch", + "combined_unit", + "", + "northbridge", + "execution_unit", }; -/* defaults used early on boot */ -static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, -}; - -struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - cpumask_var_t cpus; -}; -static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); - -#ifdef CONFIG_SMP -static unsigned char shared_bank[NR_BANKS] = { - 0, 0, 0, 0, 1 -}; -#endif - +static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); @@ -89,52 +68,153 @@ static void amd_threshold_interrupt(void); struct thresh_restart { struct threshold_block *b; int reset; + int set_lvt_off; + int lvt_off; u16 old_limit; }; -/* must be called with correct cpu affinity */ -/* Called via smp_call_function_single() */ +static inline bool is_shared_bank(int bank) +{ + /* Bank 4 is for northbridge reporting and is thus shared */ + return (bank == 4); +} + +static const char * const bank4_names(struct threshold_block *b) +{ + switch (b->address) { + /* MSR4_MISC0 */ + case 0x00000413: + return "dram"; + + case 0xc0000408: + return "ht_links"; + + case 0xc0000409: + return "l3_cache"; + + default: + WARN(1, "Funny MSR: 0x%08x\n", b->address); + return ""; + } +}; + + +static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) +{ + /* + * bank 4 supports APIC LVT interrupts implicitly since forever. + */ + if (bank == 4) + return true; + + /* + * IntP: interrupt present; if this bit is set, the thresholding + * bank can generate APIC LVT interrupts + */ + return msr_high_bits & BIT(28); +} + +static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +{ + int msr = (hi & MASK_LVTOFF_HI) >> 20; + + if (apic < 0) { + pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, + b->bank, b->block, b->address, hi, lo); + return 0; + } + + if (apic != msr) { + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", + b->cpu, apic, b->bank, b->block, b->address, hi, lo); + return 0; + } + + return 1; +}; + +/* + * Called via smp_call_function_single(), must be called with correct + * cpu affinity. + */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; - u32 mci_misc_hi, mci_misc_lo; + u32 hi, lo; - rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, lo, hi); - if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) tr->reset = 1; /* limit cannot be lower than err count */ if (tr->reset) { /* reset err count and overflow bit */ - mci_misc_hi = - (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + hi = + (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | (THRESHOLD_MAX - tr->b->threshold_limit); } else if (tr->old_limit) { /* change limit w/o reset */ - int new_count = (mci_misc_hi & THRESHOLD_MAX) + + int new_count = (hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); - mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + hi = (hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } - tr->b->interrupt_enable ? - (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (mci_misc_hi &= ~MASK_INT_TYPE_HI); + /* clear IntType */ + hi &= ~MASK_INT_TYPE_HI; + + if (!tr->b->interrupt_capable) + goto done; - mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + if (tr->set_lvt_off) { + if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { + /* set new lvt offset */ + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; + } + } + + if (tr->b->interrupt_enable) + hi |= INT_TYPE_APIC; + + done: + + hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, lo, hi); +} + +static void mce_threshold_block_init(struct threshold_block *b, int offset) +{ + struct thresh_restart tr = { + .b = b, + .set_lvt_off = 1, + .lvt_off = offset, + }; + + b->threshold_limit = THRESHOLD_MAX; + threshold_restart_bank(&tr); +}; + +static int setup_APIC_mce(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; } /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { + struct threshold_block b; unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - struct thresh_restart tr; - int lvt_off = -1; - u8 offset; + int offset = -1; - for (bank = 0; bank < NR_BANKS; ++bank) { + for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) address = MSR_IA32_MC0_MISC + bank * 4; @@ -159,43 +239,20 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP - if (shared_bank[bank] && c->cpu_core_id) - break; -#endif - offset = (high & MASK_LVTOFF_HI) >> 20; - if (lvt_off < 0) { - if (setup_APIC_eilvt(offset, - THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0)) { - pr_err(FW_BUG "cpu %d, failed to " - "setup threshold interrupt " - "for bank %d, block %d " - "(MSR%08X=0x%x%08x)", - smp_processor_id(), bank, block, - address, high, low); - continue; - } - lvt_off = offset; - } else if (lvt_off != offset) { - pr_err(FW_BUG "cpu %d, invalid threshold " - "interrupt offset %d for bank %d," - "block %d (MSR%08X=0x%x%08x)", - smp_processor_id(), lvt_off, bank, - block, address, high, low); - continue; - } - high &= ~MASK_LVTOFF_HI; - high |= lvt_off << 20; - wrmsr(address, low, high); + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = address; + b.interrupt_capable = lvt_interrupt_supported(bank, high); - threshold_defaults.address = address; - tr.b = &threshold_defaults; - tr.reset = 0; - tr.old_limit = 0; - threshold_restart_bank(&tr); + if (b.interrupt_capable) { + int new = (high & MASK_LVTOFF_HI) >> 20; + offset = setup_APIC_mce(offset, new); + } + mce_threshold_block_init(&b, offset); mce_threshold_vector = amd_threshold_interrupt; } } @@ -219,7 +276,7 @@ static void amd_threshold_interrupt(void) mce_setup(&m); /* assume first bank caused it */ - for (bank = 0; bank < NR_BANKS; ++bank) { + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { @@ -282,7 +339,7 @@ struct threshold_attr { #define SHOW_FIELDS(name) \ static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ { \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ + return sprintf(buf, "%lu\n", (unsigned long) b->name); \ } SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) @@ -293,14 +350,16 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) struct thresh_restart tr; unsigned long new; + if (!b->interrupt_capable) + return -EINVAL; + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; b->interrupt_enable = !!new; + memset(&tr, 0, sizeof(tr)); tr.b = b; - tr.reset = 0; - tr.old_limit = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); @@ -321,48 +380,31 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) if (new < 1) new = 1; + memset(&tr, 0, sizeof(tr)); tr.old_limit = b->threshold_limit; b->threshold_limit = new; tr.b = b; - tr.reset = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); return size; } -struct threshold_block_cross_cpu { - struct threshold_block *tb; - long retval; -}; - -static void local_error_count_handler(void *_tbcc) -{ - struct threshold_block_cross_cpu *tbcc = _tbcc; - struct threshold_block *b = tbcc->tb; - u32 low, high; - - rdmsr(b->address, low, high); - tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); -} - static ssize_t show_error_count(struct threshold_block *b, char *buf) { - struct threshold_block_cross_cpu tbcc = { .tb = b, }; - - smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); - return sprintf(buf, "%lx\n", tbcc.retval); -} + u32 lo, hi; -static ssize_t store_error_count(struct threshold_block *b, - const char *buf, size_t count) -{ - struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return 1; + return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - + (THRESHOLD_MAX - b->threshold_limit))); } +static struct threshold_attr error_count = { + .attr = {.name = __stringify(error_count), .mode = 0444 }, + .show = show_error_count, +}; + #define RW_ATTR(val) \ static struct threshold_attr val = { \ .attr = {.name = __stringify(val), .mode = 0644 }, \ @@ -372,13 +414,12 @@ static struct threshold_attr val = { \ RW_ATTR(interrupt_enable); RW_ATTR(threshold_limit); -RW_ATTR(error_count); static struct attribute *default_attrs[] = { - &interrupt_enable.attr, &threshold_limit.attr, &error_count.attr, - NULL + NULL, /* possibly interrupt_enable if supported, see below */ + NULL, }; #define to_block(k) container_of(k, struct threshold_block, kobj) @@ -417,16 +458,14 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; -static __cpuinit int allocate_threshold_blocks(unsigned int cpu, - unsigned int bank, - unsigned int block, - u32 address) +static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, + unsigned int block, u32 address) { struct threshold_block *b = NULL; u32 low, high; int err; - if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) + if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) return 0; if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) @@ -452,8 +491,14 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, b->cpu = cpu; b->address = address; b->interrupt_enable = 0; + b->interrupt_capable = lvt_interrupt_supported(bank, high); b->threshold_limit = THRESHOLD_MAX; + if (b->interrupt_capable) + threshold_ktype.default_attrs[2] = &interrupt_enable.attr; + else + threshold_ktype.default_attrs[2] = NULL; + INIT_LIST_HEAD(&b->miscj); if (per_cpu(threshold_banks, cpu)[bank]->blocks) { @@ -465,7 +510,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, - "misc%i", block); + (bank == 4 ? bank4_names(b) : th_names[bank])); if (err) goto out_free; recurse: @@ -490,131 +535,125 @@ recurse: out_free: if (b) { kobject_put(&b->kobj); + list_del(&b->miscj); kfree(b); } return err; } -static __cpuinit long -local_allocate_threshold_blocks(int cpu, unsigned int bank) +static int __threshold_add_blocks(struct threshold_bank *b) { - return allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); -} + struct list_head *head = &b->blocks->miscj; + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + int err = 0; -/* symlinks sibling shared banks to first core. first core owns dir/files. */ -static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) -{ - int i, err = 0; - struct threshold_bank *b = NULL; - char name[32]; -#ifdef CONFIG_SMP - struct cpuinfo_x86 *c = &cpu_data(cpu); -#endif + err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); + if (err) + return err; - sprintf(name, "threshold_bank%i", bank); + list_for_each_entry_safe(pos, tmp, head, miscj) { -#ifdef CONFIG_SMP - if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(c->llc_shared_map); + err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); + if (err) { + list_for_each_entry_safe_reverse(pos, tmp, head, miscj) + kobject_del(&pos->kobj); - /* first core not up yet */ - if (cpu_data(i).cpu_core_id) - goto out; + return err; + } + } + return err; +} - /* already linked */ - if (per_cpu(threshold_banks, cpu)[bank]) - goto out; +static int threshold_create_bank(unsigned int cpu, unsigned int bank) +{ + struct device *dev = per_cpu(mce_device, cpu); + struct amd_northbridge *nb = NULL; + struct threshold_bank *b = NULL; + const char *name = th_names[bank]; + int err = 0; - b = per_cpu(threshold_banks, i)[bank]; + if (is_shared_bank(bank)) { + nb = node_to_amd_nb(amd_get_nb_id(cpu)); - if (!b) - goto out; + /* threshold descriptor already initialized on this node? */ + if (nb && nb->bank4) { + /* yes, use it */ + b = nb->bank4; + err = kobject_add(b->kobj, &dev->kobj, name); + if (err) + goto out; - err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, - b->kobj, name); - if (err) - goto out; + per_cpu(threshold_banks, cpu)[bank] = b; + atomic_inc(&b->cpus); - cpumask_copy(b->cpus, c->llc_shared_map); - per_cpu(threshold_banks, cpu)[bank] = b; + err = __threshold_add_blocks(b); - goto out; + goto out; + } } -#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; goto out; } - if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) { - kfree(b); - err = -ENOMEM; - goto out; - } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); - if (!b->kobj) + b->kobj = kobject_create_and_add(name, &dev->kobj); + if (!b->kobj) { + err = -EINVAL; goto out_free; - -#ifndef CONFIG_SMP - cpumask_setall(b->cpus); -#else - cpumask_set_cpu(cpu, b->cpus); -#endif + } per_cpu(threshold_banks, cpu)[bank] = b; - err = local_allocate_threshold_blocks(cpu, bank); - if (err) - goto out_free; - - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; + if (is_shared_bank(bank)) { + atomic_set(&b->cpus, 1); - err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, - b->kobj, name); - if (err) - goto out; - - per_cpu(threshold_banks, i)[bank] = b; + /* nb is already initialized, see above */ + if (nb) { + WARN_ON(nb->bank4); + nb->bank4 = b; + } } - goto out; + err = allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); + if (!err) + goto out; -out_free: - per_cpu(threshold_banks, cpu)[bank] = NULL; - free_cpumask_var(b->cpus); + out_free: kfree(b); -out: + + out: return err; } /* create dir/files for all valid threshold banks */ -static __cpuinit int threshold_create_device(unsigned int cpu) +static int threshold_create_device(unsigned int cpu) { unsigned int bank; + struct threshold_bank **bp; int err = 0; - for (bank = 0; bank < NR_BANKS; ++bank) { + bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, + GFP_KERNEL); + if (!bp) + return -ENOMEM; + + per_cpu(threshold_banks, cpu) = bp; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; err = threshold_create_bank(cpu, bank); if (err) - goto out; + return err; } -out: + return err; } -/* - * let's be hotplug friendly. - * in case of multiple core processors, the first core always takes ownership - * of shared sysfs dir/files, and rest of the cores will be symlinked to it. - */ - static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { @@ -635,37 +674,42 @@ static void deallocate_threshold_block(unsigned int cpu, per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; } +static void __threshold_remove_blocks(struct threshold_bank *b) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + + kobject_del(b->kobj); + + list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) + kobject_del(&pos->kobj); +} + static void threshold_remove_bank(unsigned int cpu, int bank) { + struct amd_northbridge *nb; struct threshold_bank *b; - char name[32]; - int i = 0; b = per_cpu(threshold_banks, cpu)[bank]; if (!b) return; + if (!b->blocks) goto free_out; - sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP - /* sibling symlink */ - if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); - per_cpu(threshold_banks, cpu)[bank] = NULL; - - return; - } -#endif - - /* remove all sibling symlinks before unregistering */ - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); - per_cpu(threshold_banks, i)[bank] = NULL; + if (is_shared_bank(bank)) { + if (!atomic_dec_and_test(&b->cpus)) { + __threshold_remove_blocks(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; + return; + } else { + /* + * the last CPU on this node using the shared bank is + * going away, remove that bank now. + */ + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + nb->bank4 = NULL; + } } deallocate_threshold_block(cpu, bank); @@ -673,7 +717,6 @@ static void threshold_remove_bank(unsigned int cpu, int bank) free_out: kobject_del(b->kobj); kobject_put(b->kobj); - free_cpumask_var(b->cpus); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; } @@ -682,15 +725,16 @@ static void threshold_remove_device(unsigned int cpu) { unsigned int bank; - for (bank = 0; bank < NR_BANKS; ++bank) { + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); } + kfree(per_cpu(threshold_banks, cpu)); } /* get notified when a cpu comes on/off */ -static void __cpuinit +static void amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) { switch (action) { @@ -722,4 +766,24 @@ static __init int threshold_init_device(void) return 0; } -device_initcall(threshold_init_device); +/* + * there are 3 funcs which need to be _initcalled in a logic sequence: + * 1. xen_late_init_mcelog + * 2. mcheck_init_device + * 3. threshold_init_device + * + * xen_late_init_mcelog must register xen_mce_chrdev_device before + * native mce_chrdev_device registration if running under xen platform; + * + * mcheck_init_device should be inited before threshold_init_device to + * initialize mce_device, otherwise a NULL ptr dereference will cause panic. + * + * so we use following _initcalls + * 1. device_initcall(xen_late_init_mcelog); + * 2. device_initcall_sync(mcheck_init_device); + * 3. late_initcall(threshold_init_device); + * + * when running under xen, the initcall order is 1,2,3; + * on baremetal, we skip 1 and we do only 2 and 3. + */ +late_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 6fcd0936194..9a316b21df8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -6,15 +6,17 @@ */ #include <linux/gfp.h> -#include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <linux/cpumask.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> +#include "mce-internal.h" + /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. @@ -22,6 +24,18 @@ * Also supports reliable discovery of shared banks. */ +/* + * CMCI can be delivered to multiple cpus that share a machine check bank + * so we need to designate a single cpu to process errors logged in each bank + * in the interrupt handler (otherwise we would have many races and potential + * double reporting of the same error). + * Note that this can change when a cpu is offlined or brought online since + * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear() + * disables CMCI on all banks owned by the cpu and clears this bitfield. At + * this point, cmci_rediscover() kicks in and a different cpu may end up + * taking ownership of some of the shared MCA banks that were previously + * owned by the offlined cpu. + */ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); /* @@ -30,13 +44,28 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); */ static DEFINE_SPINLOCK(cmci_discover_lock); -#define CMCI_THRESHOLD 1 +#define CMCI_THRESHOLD 1 +#define CMCI_POLL_INTERVAL (30 * HZ) +#define CMCI_STORM_INTERVAL (1 * HZ) +#define CMCI_STORM_THRESHOLD 15 + +static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int, cmci_storm_state); + +enum { + CMCI_STORM_NONE, + CMCI_STORM_ACTIVE, + CMCI_STORM_SUBSIDED, +}; + +static atomic_t cmci_storm_on_cpus; static int cmci_supported(int *banks) { u64 cap; - if (mce_cmci_disabled || mce_ignore_ce) + if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) return 0; /* @@ -53,6 +82,109 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +void mce_intel_cmci_poll(void) +{ + if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) + return; + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); +} + +void mce_intel_hcpu_update(unsigned long cpu) +{ + if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) + atomic_dec(&cmci_storm_on_cpus); + + per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +} + +unsigned long mce_intel_adjust_timer(unsigned long interval) +{ + int r; + + if (interval < CMCI_POLL_INTERVAL) + return interval; + + switch (__this_cpu_read(cmci_storm_state)) { + case CMCI_STORM_ACTIVE: + /* + * We switch back to interrupt mode once the poll timer has + * silenced itself. That means no events recorded and the + * timer interval is back to our poll interval. + */ + __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); + r = atomic_sub_return(1, &cmci_storm_on_cpus); + if (r == 0) + pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + /* FALLTHROUGH */ + + case CMCI_STORM_SUBSIDED: + /* + * We wait for all cpus to go back to SUBSIDED + * state. When that happens we switch back to + * interrupt mode. + */ + if (!atomic_read(&cmci_storm_on_cpus)) { + __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); + cmci_reenable(); + cmci_recheck(); + } + return CMCI_POLL_INTERVAL; + default: + /* + * We have shiny weather. Let the poll do whatever it + * thinks. + */ + return interval; + } +} + +static void cmci_storm_disable_banks(void) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + spin_lock_irqsave(&cmci_discover_lock, flags); + owned = __get_cpu_var(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +static bool cmci_storm_detect(void) +{ + unsigned int cnt = __this_cpu_read(cmci_storm_cnt); + unsigned long ts = __this_cpu_read(cmci_time_stamp); + unsigned long now = jiffies; + int r; + + if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) + return true; + + if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { + cnt++; + } else { + cnt = 1; + __this_cpu_write(cmci_time_stamp, now); + } + __this_cpu_write(cmci_storm_cnt, cnt); + + if (cnt <= CMCI_STORM_THRESHOLD) + return false; + + cmci_storm_disable_banks(); + __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); + r = atomic_add_return(1, &cmci_storm_on_cpus); + mce_timer_kick(CMCI_POLL_INTERVAL); + + if (r == 1) + pr_notice("CMCI storm detected: switching to poll mode\n"); + return true; +} + /* * The interrupt handler. This is called on every event. * Just call the poller directly to log any events. @@ -61,64 +193,86 @@ static int cmci_supported(int *banks) */ static void intel_threshold_interrupt(void) { + if (cmci_storm_detect()) + return; machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); mce_notify_irq(); } -static void print_update(char *type, int *hdr, int num) -{ - if (*hdr == 0) - printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); - *hdr = 1; - printk(KERN_CONT " %s:%d", type, num); -} - /* * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks * on this CPU. Use the algorithm recommended in the SDM to discover shared * banks. */ -static void cmci_discover(int banks, int boot) +static void cmci_discover(int banks) { unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); unsigned long flags; - int hdr = 0; int i; + int bios_wrong_thresh = 0; spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; + int bios_zero_thresh = 0; if (test_bit(i, owned)) continue; + /* Skip banks in firmware first mode */ + if (test_bit(i, mce_banks_ce_disabled)) + continue; + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ if (val & MCI_CTL2_CMCI_EN) { - if (test_and_clear_bit(i, owned) && !boot) - print_update("SHD", &hdr, i); + clear_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } - val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; + if (!mca_cfg.bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { + /* + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. + */ + bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; + } + + val |= MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ if (val & MCI_CTL2_CMCI_EN) { - if (!test_and_set_bit(i, owned) && !boot) - print_update("CMCI", &hdr, i); + set_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + bios_wrong_thresh = 1; } else { WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } spin_unlock_irqrestore(&cmci_discover_lock, flags); - if (hdr) - printk(KERN_CONT "\n"); + if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { + pr_info_once( + "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); + pr_info_once( + "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); + } } /* @@ -130,13 +284,26 @@ void cmci_recheck(void) unsigned long flags; int banks; - if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) return; local_irq_save(flags); machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); local_irq_restore(flags); } +/* Caller must hold the lock on cmci_discover_lock */ +static void __cmci_disable_bank(int bank) +{ + u64 val; + + if (!test_bit(bank, __get_cpu_var(mce_banks_owned))) + return; + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + __clear_bit(bank, __get_cpu_var(mce_banks_owned)); +} + /* * Disable CMCI on this CPU for all banks it owns when it goes down. * This allows other CPUs to claim the banks on rediscovery. @@ -146,51 +313,33 @@ void cmci_clear(void) unsigned long flags; int i; int banks; - u64 val; if (!cmci_supported(&banks)) return; spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - if (!test_bit(i, __get_cpu_var(mce_banks_owned))) - continue; - /* Disable CMCI */ - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MCx_CTL2(i), val); - __clear_bit(i, __get_cpu_var(mce_banks_owned)); - } + for (i = 0; i < banks; i++) + __cmci_disable_bank(i); spin_unlock_irqrestore(&cmci_discover_lock, flags); } -/* - * After a CPU went down cycle through all the others and rediscover - * Must run in process context. - */ -void cmci_rediscover(int dying) +static void cmci_rediscover_work_func(void *arg) +{ + int banks; + + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks); +} + +/* After a CPU went down cycle through all the others and rediscover */ +void cmci_rediscover(void) { int banks; - int cpu; - cpumask_var_t old; if (!cmci_supported(&banks)) return; - if (!alloc_cpumask_var(&old, GFP_KERNEL)) - return; - cpumask_copy(old, ¤t->cpus_allowed); - for_each_online_cpu(cpu) { - if (cpu == dying) - continue; - if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) - continue; - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks, 0); - } - - set_cpus_allowed_ptr(current, old); - free_cpumask_var(old); + on_each_cpu(cmci_rediscover_work_func, NULL, 1); } /* @@ -200,7 +349,20 @@ void cmci_reenable(void) { int banks; if (cmci_supported(&banks)) - cmci_discover(banks, 0); + cmci_discover(banks); +} + +void cmci_disable_bank(int bank) +{ + int banks; + unsigned long flags; + + if (!cmci_supported(&banks)) + return; + + spin_lock_irqsave(&cmci_discover_lock, flags); + __cmci_disable_bank(bank); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void intel_init_cmci(void) @@ -211,7 +373,7 @@ static void intel_init_cmci(void) return; mce_threshold_vector = intel_threshold_interrupt; - cmci_discover(banks, 1); + cmci_discover(banks); /* * For CPU #0 this runs with still disabled APIC, but that's * ok because only the vector is set up. We still do another diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 5c0e6533d9b..a3042989398 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -5,11 +5,9 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> -#include <linux/init.h> #include <linux/smp.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/mce.h> #include <asm/msr.h> @@ -34,7 +32,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) smp_processor_id()); } - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 4b683267eca..36a1bb6d1ee 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -18,18 +18,18 @@ #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/percpu.h> -#include <linux/sysdev.h> +#include <linux/export.h> #include <linux/types.h> #include <linux/init.h> #include <linux/smp.h> #include <linux/cpu.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/apic.h> #include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/trace/irq_vectors.h> /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) @@ -53,8 +53,26 @@ struct thermal_state { struct _thermal_state core_power_limit; struct _thermal_state package_throttle; struct _thermal_state package_power_limit; + struct _thermal_state core_thresh0; + struct _thermal_state core_thresh1; + struct _thermal_state pkg_thresh0; + struct _thermal_state pkg_thresh1; }; +/* Callback to handle core threshold interrupts */ +int (*platform_thermal_notify)(__u64 msr_val); +EXPORT_SYMBOL(platform_thermal_notify); + +/* Callback to handle core package threshold_interrupts */ +int (*platform_thermal_package_notify)(__u64 msr_val); +EXPORT_SYMBOL_GPL(platform_thermal_package_notify); + +/* Callback support of rate control, return true, if + * callback has rate control */ +bool (*platform_thermal_package_rate_control)(void); +EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control); + + static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -62,16 +80,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0); static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, \ - therm_throt_sysdev_show_##_name, \ +#define define_therm_throt_device_one_ro(_name) \ + static DEVICE_ATTR(_name, 0444, \ + therm_throt_device_show_##_name, \ NULL) \ -#define define_therm_throt_sysdev_show_func(event, name) \ +#define define_therm_throt_device_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##event##_##name( \ - struct sys_device *dev, \ - struct sysdev_attribute *attr, \ +static ssize_t therm_throt_device_show_##event##_##name( \ + struct device *dev, \ + struct device_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ @@ -88,20 +106,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \ return ret; \ } -define_therm_throt_sysdev_show_func(core_throttle, count); -define_therm_throt_sysdev_one_ro(core_throttle_count); +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(core_power_limit, count); -define_therm_throt_sysdev_one_ro(core_power_limit_count); +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); -define_therm_throt_sysdev_show_func(package_throttle, count); -define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); -define_therm_throt_sysdev_show_func(package_power_limit, count); -define_therm_throt_sysdev_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_core_throttle_count.attr, + &dev_attr_core_throttle_count.attr, NULL }; @@ -176,13 +194,6 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - else - printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package", - state->count); - - add_taint(TAINT_MACHINE_CHECK); return 1; } if (old_event) { @@ -190,86 +201,105 @@ static int therm_throt_process(bool new_event, int event, int level) printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); - else - printk(KERN_INFO "CPU%d: %s power limit normal\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package"); return 1; } return 0; } +static int thresh_event_valid(int level, int event) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + u64 now = get_jiffies_64(); + + if (level == PACKAGE_LEVEL) + state = (event == 0) ? &pstate->pkg_thresh0 : + &pstate->pkg_thresh1; + else + state = (event == 0) ? &pstate->core_thresh0 : + &pstate->core_thresh1; + + if (time_before64(now, state->next_check)) + return 0; + + state->next_check = now + CHECK_INTERVAL; + + return 1; +} + +static bool int_pln_enable; +static int __init int_pln_enable_setup(char *s) +{ + int_pln_enable = true; + + return 1; +} +__setup("int_pln_enable", int_pln_enable_setup); + #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, - unsigned int cpu) +static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) { int err; struct cpuinfo_x86 *c = &cpu_data(cpu); - err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); + err = sysfs_create_group(&dev->kobj, &thermal_attr_group); if (err) return err; - if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_core_power_limit_count.attr, + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_throttle_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_count.attr, thermal_attr_group.name); - if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_power_limit_count.attr, + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); } return err; } -static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) +static void thermal_throttle_remove_dev(struct device *dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); + sysfs_remove_group(&dev->kobj, &thermal_attr_group); } -/* Mutex protecting device creation against CPU hotplug: */ -static DEFINE_MUTEX(therm_cpu_lock); - /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int +static int thermal_throttle_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; int err = 0; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - mutex_lock(&therm_cpu_lock); - err = thermal_throttle_add_dev(sys_dev, cpu); - mutex_unlock(&therm_cpu_lock); + err = thermal_throttle_add_dev(dev, cpu); WARN_ON(err); break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - mutex_lock(&therm_cpu_lock); - thermal_throttle_remove_dev(sys_dev); - mutex_unlock(&therm_cpu_lock); + thermal_throttle_remove_dev(dev); break; } return notifier_from_errno(err); } -static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = +static struct notifier_block thermal_throttle_cpu_notifier = { .notifier_call = thermal_throttle_cpu_callback, }; @@ -282,19 +312,16 @@ static __init int thermal_throttle_init_device(void) if (!atomic_read(&therm_throt_en)) return 0; - register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_begin(); -#ifdef CONFIG_HOTPLUG_CPU - mutex_lock(&therm_cpu_lock); -#endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); + err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); WARN_ON(err); } -#ifdef CONFIG_HOTPLUG_CPU - mutex_unlock(&therm_cpu_lock); -#endif + + __register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_done(); return 0; } @@ -302,49 +329,89 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ -/* - * Set up the most two significant bit to notify mce log that this thermal - * event type. - * This is a temp solution. May be changed in the future with mce log - * infrasture. - */ -#define CORE_THROTTLED (0) -#define CORE_POWER_LIMIT ((__u64)1 << 62) -#define PACKAGE_THROTTLED ((__u64)2 << 62) -#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) +static void notify_package_thresholds(__u64 msr_val) +{ + bool notify_thres_0 = false; + bool notify_thres_1 = false; + + if (!platform_thermal_package_notify) + return; + + /* lower threshold check */ + if (msr_val & THERM_LOG_THRESHOLD0) + notify_thres_0 = true; + /* higher threshold check */ + if (msr_val & THERM_LOG_THRESHOLD1) + notify_thres_1 = true; + + if (!notify_thres_0 && !notify_thres_1) + return; + + if (platform_thermal_package_rate_control && + platform_thermal_package_rate_control()) { + /* Rate control is implemented in callback */ + platform_thermal_package_notify(msr_val); + return; + } + + /* lower threshold reached */ + if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0)) + platform_thermal_package_notify(msr_val); + /* higher threshold reached */ + if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1)) + platform_thermal_package_notify(msr_val); +} + +static void notify_thresholds(__u64 msr_val) +{ + /* check whether the interrupt handler is defined; + * otherwise simply return + */ + if (!platform_thermal_notify) + return; + + /* lower threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD0) && + thresh_event_valid(CORE_LEVEL, 0)) + platform_thermal_notify(msr_val); + /* higher threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD1) && + thresh_event_valid(CORE_LEVEL, 1)) + platform_thermal_notify(msr_val); +} /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { __u64 msr_val; - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + /* Check for violation of core thermal thresholds*/ + notify_thresholds(msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + mce_log_therm_throt_event(msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); + CORE_LEVEL); - if (cpu_has(c, X86_FEATURE_PTS)) { + if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + /* check violations of package thermal thresholds */ + notify_package_thresholds(msr_val); + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & + PACKAGE_LEVEL); + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_POWER_LIMIT - | msr_val); + PACKAGE_LEVEL); } } @@ -352,20 +419,30 @@ static void unexpected_thermal_interrupt(void) { printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +static inline void __smp_thermal_interrupt(void) { - exit_idle(); - irq_enter(); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); - irq_exit(); - /* Ack only at the end to avoid potential reentry */ - ack_APIC_irq(); +} + +asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) +{ + entering_irq(); + __smp_thermal_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) +{ + entering_irq(); + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + __smp_thermal_interrupt(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + exiting_ack_irq(); } /* Thermal monitoring depends on APIC, ACPI and clock modulation */ @@ -405,18 +482,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = lvtthmr_init; /* * The initial value of thermal LVT entries on all APs always reads * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI * sequence to them and LVT registers are reset to 0s except for * the mask bits which are set to 1s when APs receive INIT IPI. - * Always restore the value that BIOS has programmed on AP based on - * BSP's info we saved since BIOS is always setting the same value - * for all threads/cores + * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. */ - apic_write(APIC_LVTTHMR, lvtthmr_init); + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); - h = lvtthmr_init; if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG @@ -447,9 +526,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN)) + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) + wrmsr(MSR_IA32_THERM_INTERRUPT, + (l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); else wrmsr(MSR_IA32_THERM_INTERRUPT, @@ -457,9 +540,14 @@ void intel_init_thermal(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_PTS)) { rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN)) + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE + (l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE)) + & ~PACKAGE_THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE | PACKAGE_THERM_INT_HIGH_ENABLE | PACKAGE_THERM_INT_PLN_ENABLE), h); else diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index d746df2909c..7245980186e 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -8,6 +8,7 @@ #include <asm/apic.h> #include <asm/idle.h> #include <asm/mce.h> +#include <asm/trace/irq_vectors.h> static void default_threshold_interrupt(void) { @@ -17,13 +18,24 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage void smp_threshold_interrupt(void) +static inline void __smp_threshold_interrupt(void) { - exit_idle(); - irq_enter(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); - irq_exit(); - /* Ack only at the end to avoid potential reentry */ - ack_APIC_irq(); +} + +asmlinkage __visible void smp_threshold_interrupt(void) +{ + entering_irq(); + __smp_threshold_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_threshold_interrupt(void) +{ + entering_irq(); + trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); + __smp_threshold_interrupt(); + trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); + exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 54060f56597..7dc5564d0cd 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -5,10 +5,8 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> -#include <linux/init.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/mce.h> #include <asm/msr.h> @@ -16,7 +14,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); } /* Set up machine check reporting on the Winchip C6 series */ |
