diff options
Diffstat (limited to 'arch/ia64/kernel/mca_drv.c')
| -rw-r--r-- | arch/ia64/kernel/mca_drv.c | 266 |
1 files changed, 196 insertions, 70 deletions
diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index f081c60ab20..94f8bf777af 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c @@ -3,18 +3,17 @@ * Purpose: Generic MCA handling layer * * Copyright (C) 2004 FUJITSU LIMITED - * Copyright (C) Hidetoshi Seto (seto.hidetoshi@jp.fujitsu.com) + * Copyright (C) 2004 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> * Copyright (C) 2005 Silicon Graphics, Inc * Copyright (C) 2005 Keith Owens <kaos@sgi.com> + * Copyright (C) 2006 Russ Anderson <rja@sgi.com> */ -#include <linux/config.h> #include <linux/types.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/kallsyms.h> -#include <linux/smp_lock.h> #include <linux/bootmem.h> #include <linux/acpi.h> #include <linux/timer.h> @@ -23,12 +22,12 @@ #include <linux/smp.h> #include <linux/workqueue.h> #include <linux/mm.h> +#include <linux/slab.h> #include <asm/delay.h> #include <asm/machvec.h> #include <asm/page.h> #include <asm/ptrace.h> -#include <asm/system.h> #include <asm/sal.h> #include <asm/mca.h> @@ -61,6 +60,11 @@ typedef enum { ISOLATE_NONE } isolate_status_t; +typedef enum { + MCA_NOT_RECOVERED = 0, + MCA_RECOVERED = 1 +} recovery_status_t; + /* * This pool keeps pointers to the section part of SAL error record */ @@ -70,6 +74,34 @@ static struct { int max_idx; /* Maximum index of section pointer list pool */ } slidx_pool; +static int +fatal_mca(const char *fmt, ...) +{ + va_list args; + char buf[256]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + ia64_mca_printk(KERN_ALERT "MCA: %s\n", buf); + + return MCA_NOT_RECOVERED; +} + +static int +mca_recovered(const char *fmt, ...) +{ + va_list args; + char buf[256]; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + ia64_mca_printk(KERN_INFO "MCA: %s\n", buf); + + return MCA_RECOVERED; +} + /** * mca_page_isolate - isolate a poisoned page in order not to use it later * @paddr: poisoned memory location @@ -88,7 +120,7 @@ mca_page_isolate(unsigned long paddr) if (!ia64_phys_addr_valid(paddr)) return ISOLATE_NONE; - if (!pfn_valid(paddr)) + if (!pfn_valid(paddr >> PAGE_SHIFT)) return ISOLATE_NONE; /* convert physical address to physical page number */ @@ -108,6 +140,7 @@ mca_page_isolate(unsigned long paddr) return ISOLATE_NG; /* add attribute 'Reserved' and register the page */ + get_page(p); SetPageReserved(p); page_isolate[num_page_isolate++] = p; @@ -120,10 +153,14 @@ mca_page_isolate(unsigned long paddr) */ void -mca_handler_bh(unsigned long paddr) +mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr) { - printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n", - current->pid, current->comm); + ia64_mlogbuf_dump(); + printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, " + "iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n", + raw_smp_processor_id(), current->pid, + from_kuid(&init_user_ns, current_uid()), + iip, ipsr, paddr, current->comm); spin_lock(&mca_bh_lock); switch (mca_page_isolate(paddr)) { @@ -131,7 +168,7 @@ mca_handler_bh(unsigned long paddr) printk(KERN_DEBUG "Page isolation: ( %lx ) success.\n", paddr); break; case ISOLATE_NG: - printk(KERN_DEBUG "Page isolation: ( %lx ) failure.\n", paddr); + printk(KERN_CRIT "Page isolation: ( %lx ) failure.\n", paddr); break; default: break; @@ -312,7 +349,7 @@ init_record_index_pools(void) /* - 3 - */ slidx_pool.max_idx = (rec_max_size/sect_min_size) * 2 + 1; - slidx_pool.buffer = (slidx_list_t *) + slidx_pool.buffer = kmalloc(slidx_pool.max_idx * sizeof(slidx_list_t), GFP_KERNEL); return slidx_pool.buffer ? 0 : -ENOMEM; @@ -398,6 +435,50 @@ is_mca_global(peidx_table_t *peidx, pal_bus_check_info_t *pbci, } /** + * get_target_identifier - Get the valid Cache or Bus check target identifier. + * @peidx: pointer of index of processor error section + * + * Return value: + * target address on Success / 0 on Failure + */ +static u64 +get_target_identifier(peidx_table_t *peidx) +{ + u64 target_address = 0; + sal_log_mod_error_info_t *smei; + pal_cache_check_info_t *pcci; + int i, level = 9; + + /* + * Look through the cache checks for a valid target identifier + * If more than one valid target identifier, return the one + * with the lowest cache level. + */ + for (i = 0; i < peidx_cache_check_num(peidx); i++) { + smei = (sal_log_mod_error_info_t *)peidx_cache_check(peidx, i); + if (smei->valid.target_identifier && smei->target_identifier) { + pcci = (pal_cache_check_info_t *)&(smei->check_info); + if (!target_address || (pcci->level < level)) { + target_address = smei->target_identifier; + level = pcci->level; + continue; + } + } + } + if (target_address) + return target_address; + + /* + * Look at the bus check for a valid target identifier + */ + smei = peidx_bus_check(peidx, 0); + if (smei && smei->valid.target_identifier) + return smei->target_identifier; + + return 0; +} + +/** * recover_from_read_error - Try to recover the errors which type are "read"s. * @slidx: pointer of index of SAL error record * @peidx: pointer of index of processor error section @@ -413,14 +494,15 @@ recover_from_read_error(slidx_table_t *slidx, peidx_table_t *peidx, pal_bus_check_info_t *pbci, struct ia64_sal_os_state *sos) { - sal_log_mod_error_info_t *smei; + u64 target_identifier; pal_min_state_area_t *pmsa; struct ia64_psr *psr1, *psr2; ia64_fptr_t *mca_hdlr_bh = (ia64_fptr_t*)mca_handler_bhhook; /* Is target address valid? */ - if (!pbci->tv) - return 0; + target_identifier = get_target_identifier(peidx); + if (!target_identifier) + return fatal_mca("target address not valid"); /* * cpu read or memory-mapped io read @@ -436,39 +518,46 @@ recover_from_read_error(slidx_table_t *slidx, * the process not have any locks of kernel. */ + /* Is minstate valid? */ + if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate)) + return fatal_mca("minstate not valid"); psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr); + psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr); /* * Check the privilege level of interrupted context. * If it is user-mode, then terminate affected process. */ - if (psr1->cpl != 0) { - smei = peidx_bus_check(peidx, 0); - if (smei->valid.target_identifier) { - /* - * setup for resume to bottom half of MCA, - * "mca_handler_bhhook" - */ - pmsa = sos->pal_min_state; - /* pass to bhhook as 1st argument (gr8) */ - pmsa->pmsa_gr[8-1] = smei->target_identifier; - /* set interrupted return address (but no use) */ - pmsa->pmsa_br0 = pmsa->pmsa_iip; - /* change resume address to bottom half */ - pmsa->pmsa_iip = mca_hdlr_bh->fp; - pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp; - /* set cpl with kernel mode */ - psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr; - psr2->cpl = 0; - psr2->ri = 0; - psr2->i = 0; - - return 1; - } + pmsa = sos->pal_min_state; + if (psr1->cpl != 0 || + ((psr2->cpl != 0) && mca_recover_range(pmsa->pmsa_iip))) { + /* + * setup for resume to bottom half of MCA, + * "mca_handler_bhhook" + */ + /* pass to bhhook as argument (gr8, ...) */ + pmsa->pmsa_gr[8-1] = target_identifier; + pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip; + pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr; + /* set interrupted return address (but no use) */ + pmsa->pmsa_br0 = pmsa->pmsa_iip; + /* change resume address to bottom half */ + pmsa->pmsa_iip = mca_hdlr_bh->fp; + pmsa->pmsa_gr[1-1] = mca_hdlr_bh->gp; + /* set cpl with kernel mode */ + psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr; + psr2->cpl = 0; + psr2->ri = 0; + psr2->bn = 1; + psr2->i = 0; + + return mca_recovered("user memory corruption. " + "kill affected process - recovered."); } - return 0; + return fatal_mca("kernel context not recovered, iip 0x%lx\n", + pmsa->pmsa_iip); } /** @@ -513,11 +602,40 @@ recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx, default: break; } + } else if (psp->cc && !psp->bc) { /* Cache error */ + status = recover_from_read_error(slidx, peidx, pbci, sos); } return status; } +/* + * recover_from_tlb_check + * @peidx: pointer of index of processor error section + * + * Return value: + * 1 on Success / 0 on Failure + */ +static int +recover_from_tlb_check(peidx_table_t *peidx) +{ + sal_log_mod_error_info_t *smei; + pal_tlb_check_info_t *ptci; + + smei = (sal_log_mod_error_info_t *)peidx_tlb_check(peidx, 0); + ptci = (pal_tlb_check_info_t *)&(smei->check_info); + + /* + * Look for signature of a duplicate TLB DTC entry, which is + * a SW bug and always fatal. + */ + if (ptci->op == PAL_TLB_CHECK_OP_PURGE + && !(ptci->itr || ptci->dtc || ptci->itc)) + return fatal_mca("Duplicate TLB entry"); + + return mca_recovered("TLB check recovered"); +} + /** * recover_from_processor_error * @platform: whether there are some platform error section or not @@ -529,13 +647,6 @@ recover_from_platform_error(slidx_table_t *slidx, peidx_table_t *peidx, * Return value: * 1 on Success / 0 on Failure */ -/* - * Later we try to recover when below all conditions are satisfied. - * 1. Only one processor error section is exist. - * 2. BUS_CHECK is exist and the others are not exist.(Except TLB_CHECK) - * 3. The entry of BUS_CHECK_INFO is 1. - * 4. "External bus error" flag is set and the others are not set. - */ static int recover_from_processor_error(int platform, slidx_table_t *slidx, @@ -546,43 +657,60 @@ recover_from_processor_error(int platform, slidx_table_t *slidx, (pal_processor_state_info_t*)peidx_psp(peidx); /* - * We cannot recover errors with other than bus_check. + * Processor recovery status must key off of the PAL recovery + * status in the Processor State Parameter. */ - if (psp->cc || psp->rc || psp->uc) - return 0; /* - * If there is no bus error, record is weird but we need not to recover. + * The machine check is corrected. */ - if (psp->bc == 0 || pbci == NULL) - return 1; + if (psp->cm == 1) + return mca_recovered("machine check is already corrected."); /* - * Sorry, we cannot handle so many. + * The error was not contained. Software must be reset. */ - if (peidx_bus_check_num(peidx) > 1) - return 0; + if (psp->us || psp->ci == 0) + return fatal_mca("error not contained"); + /* - * Well, here is only one bus error. + * Look for recoverable TLB check */ - if (pbci->ib || pbci->cc) - return 0; + if (psp->tc && !(psp->cc || psp->bc || psp->rc || psp->uc)) + return recover_from_tlb_check(peidx); + + /* + * The cache check and bus check bits have four possible states + * cc bc + * 1 1 Memory error, attempt recovery + * 1 0 Cache error, attempt recovery + * 0 1 I/O error, attempt recovery + * 0 0 Other error type, not recovered + */ + if (psp->cc == 0 && (psp->bc == 0 || pbci == NULL)) + return fatal_mca("No cache or bus check"); + + /* + * Cannot handle more than one bus check. + */ + if (peidx_bus_check_num(peidx) > 1) + return fatal_mca("Too many bus checks"); + + if (pbci->ib) + return fatal_mca("Internal Bus error"); if (pbci->eb && pbci->bsi > 0) - return 0; - if (psp->ci == 0) - return 0; + return fatal_mca("External bus check fatal status"); /* - * This is a local MCA and estimated as recoverble external bus error. - * (e.g. a load from poisoned memory) - * This means "there are some platform errors". + * This is a local MCA and estimated as a recoverable error. */ if (platform) return recover_from_platform_error(slidx, peidx, pbci, sos); + /* * On account of strange SAL error record, we cannot recover. */ - return 0; + return fatal_mca("Strange SAL record"); } /** @@ -611,12 +739,10 @@ mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos) /* Now, OS can recover when there is one processor error section */ if (n_proc_err > 1) - return 0; - else if (n_proc_err == 0) { - /* Weird SAL record ... We need not to recover */ - - return 1; - } + return fatal_mca("Too Many Errors"); + else if (n_proc_err == 0) + /* Weird SAL record ... We can't do anything */ + return fatal_mca("Weird SAL record"); /* Make index of processor error section */ mca_make_peidx((sal_log_processor_info_t*) @@ -627,7 +753,7 @@ mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos) /* Check whether MCA is global or not */ if (is_mca_global(&peidx, &pbci, sos)) - return 0; + return fatal_mca("global MCA"); /* Try to recover a processor error */ return recover_from_processor_error(platform_err, &slidx, &peidx, |
