diff options
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck')
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 79 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce_intel.c | 33 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/therm_throt.c | 22 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mcheck/threshold.c | 4 |
4 files changed, 96 insertions, 42 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4d5419b249d..9a79c8dbd8e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define SPINUNIT 100 /* 100ns */ -atomic_t mce_entry; - DEFINE_PER_CPU(unsigned, mce_exception_count); struct mce_bank *mce_banks __read_mostly; @@ -89,6 +87,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +/* CMCI storm detection filter */ +static DEFINE_PER_CPU(unsigned long, mce_polled_error); + /* * MCA banks polled by the period polling timer for corrected events. * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). @@ -614,6 +615,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(m.status & MCI_STATUS_VAL)) continue; + this_cpu_write(mce_polled_error, 1); /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -700,8 +702,7 @@ static int mce_timed_out(u64 *t) if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - /* CHECKME: Make panic default for 1 too? */ - if (mca_cfg.tolerant < 1) + if (mca_cfg.tolerant <= 1) mce_panic("Timeout synchronizing machine check over CPUs", NULL, NULL); cpu_missing = 1; @@ -1037,8 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - atomic_inc(&mce_entry); - this_cpu_inc(mce_exception_count); if (!cfg->banks) @@ -1168,7 +1167,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out: - atomic_dec(&mce_entry); sync_core(); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1278,10 +1276,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; +static int cmc_error_seen(void) +{ + unsigned long *v = &__get_cpu_var(mce_polled_error); + + return test_and_clear_bit(0, v); +} + static void mce_timer_fn(unsigned long data) { struct timer_list *t = &__get_cpu_var(mce_timer); unsigned long iv; + int notify; WARN_ON(smp_processor_id() != data); @@ -1296,7 +1302,9 @@ static void mce_timer_fn(unsigned long data) * polling interval, otherwise increase the polling interval. */ iv = __this_cpu_read(mce_next_interval); - if (mce_notify_irq()) { + notify = mce_notify_irq(); + notify |= cmc_error_seen(); + if (notify) { iv = max(iv / 2, (unsigned long) HZ/100); } else { iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); @@ -2423,28 +2431,67 @@ static __init int mcheck_init_device(void) int err; int i = 0; - if (!mce_available(&boot_cpu_data)) - return -EIO; + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } - zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } mce_init_banks(); err = subsys_system_register(&mce_subsys, NULL); if (err) - return err; + goto err_out_mem; + cpu_notifier_register_begin(); for_each_online_cpu(i) { err = mce_device_create(i); - if (err) - return err; + if (err) { + /* + * Register notifier anyway (and do not unreg it) so + * that we don't leave undeleted timers, see notifier + * callback above. + */ + __register_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + goto err_device_create; + } } + __register_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + register_syscore_ops(&mce_syscore_ops); - register_hotcpu_notifier(&mce_cpu_notifier); /* register character device /dev/mcelog */ - misc_register(&mce_chrdev_device); + err = misc_register(&mce_chrdev_device); + if (err) + goto err_register; + + return 0; + +err_register: + unregister_syscore_ops(&mce_syscore_ops); + +err_device_create: + /* + * We didn't keep track of which devices were created above, but + * even if we had, the set of online cpus might have changed. + * Play safe and remove for every possible cpu, since + * mce_device_remove() will do the right thing. + */ + for_each_possible_cpu(i) + mce_device_remove(i); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index fb6156fee6f..9a316b21df8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -9,6 +9,7 @@ #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <linux/cpumask.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/msr.h> @@ -41,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ -static DEFINE_RAW_SPINLOCK(cmci_discover_lock); +static DEFINE_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) @@ -137,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval) } } +static void cmci_storm_disable_banks(void) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + spin_lock_irqsave(&cmci_discover_lock, flags); + owned = __get_cpu_var(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + static bool cmci_storm_detect(void) { unsigned int cnt = __this_cpu_read(cmci_storm_cnt); @@ -158,7 +175,7 @@ static bool cmci_storm_detect(void) if (cnt <= CMCI_STORM_THRESHOLD) return false; - cmci_clear(); + cmci_storm_disable_banks(); __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); r = atomic_add_return(1, &cmci_storm_on_cpus); mce_timer_kick(CMCI_POLL_INTERVAL); @@ -194,7 +211,7 @@ static void cmci_discover(int banks) int i; int bios_wrong_thresh = 0; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; int bios_zero_thresh = 0; @@ -249,7 +266,7 @@ static void cmci_discover(int banks) WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { pr_info_once( "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); @@ -299,10 +316,10 @@ void cmci_clear(void) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) __cmci_disable_bank(i); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void cmci_rediscover_work_func(void *arg) @@ -343,9 +360,9 @@ void cmci_disable_bank(int bank) if (!cmci_supported(&banks)) return; - raw_spin_lock_irqsave(&cmci_discover_lock, flags); + spin_lock_irqsave(&cmci_discover_lock, flags); __cmci_disable_bank(bank); - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void intel_init_cmci(void) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 3eec7de76ef..36a1bb6d1ee 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -271,9 +271,6 @@ static void thermal_throttle_remove_dev(struct device *dev) sysfs_remove_group(&dev->kobj, &thermal_attr_group); } -/* Mutex protecting device creation against CPU hotplug: */ -static DEFINE_MUTEX(therm_cpu_lock); - /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int thermal_throttle_cpu_callback(struct notifier_block *nfb, @@ -289,18 +286,14 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - mutex_lock(&therm_cpu_lock); err = thermal_throttle_add_dev(dev, cpu); - mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - mutex_lock(&therm_cpu_lock); thermal_throttle_remove_dev(dev); - mutex_unlock(&therm_cpu_lock); break; } return notifier_from_errno(err); @@ -319,19 +312,16 @@ static __init int thermal_throttle_init_device(void) if (!atomic_read(&therm_throt_en)) return 0; - register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_begin(); -#ifdef CONFIG_HOTPLUG_CPU - mutex_lock(&therm_cpu_lock); -#endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); WARN_ON(err); } -#ifdef CONFIG_HOTPLUG_CPU - mutex_unlock(&therm_cpu_lock); -#endif + + __register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_done(); return 0; } @@ -439,14 +429,14 @@ static inline void __smp_thermal_interrupt(void) smp_thermal_vector(); } -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) { entering_irq(); __smp_thermal_interrupt(); exiting_ack_irq(); } -asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fe6b1c86645..7245980186e 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -24,14 +24,14 @@ static inline void __smp_threshold_interrupt(void) mce_threshold_vector(); } -asmlinkage void smp_threshold_interrupt(void) +asmlinkage __visible void smp_threshold_interrupt(void) { entering_irq(); __smp_threshold_interrupt(); exiting_ack_irq(); } -asmlinkage void smp_trace_threshold_interrupt(void) +asmlinkage __visible void smp_trace_threshold_interrupt(void) { entering_irq(); trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); |
