diff options
Diffstat (limited to 'arch/x86/platform/uv')
| -rw-r--r-- | arch/x86/platform/uv/Makefile | 2 | ||||
| -rw-r--r-- | arch/x86/platform/uv/bios_uv.c | 2 | ||||
| -rw-r--r-- | arch/x86/platform/uv/tlb_uv.c | 537 | ||||
| -rw-r--r-- | arch/x86/platform/uv/uv_irq.c | 19 | ||||
| -rw-r--r-- | arch/x86/platform/uv/uv_nmi.c | 727 | ||||
| -rw-r--r-- | arch/x86/platform/uv/uv_time.c | 22 |
6 files changed, 1038 insertions, 271 deletions
diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile index 6c40995fefb..52079bebd01 100644 --- a/arch/x86/platform/uv/Makefile +++ b/arch/x86/platform/uv/Makefile @@ -1 +1 @@ -obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o +obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o uv_nmi.o diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 766612137a6..1584cbed0dc 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -39,7 +39,7 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) */ return BIOS_STATUS_UNIMPLEMENTED; - ret = efi_call6((void *)__va(tab->function), (u64)which, + ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5); return ret; } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3ae0e61abd2..dfe605ac1bc 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1,7 +1,7 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI. + * (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI. * * This code is released under the GNU General Public License version 2 or * later. @@ -38,8 +38,7 @@ static int timeout_base_ns[] = { static int timeout_us; static int nobau; -static int baudisabled; -static spinlock_t disable_lock; +static int nobau_perm; static cycles_t congested_cycles; /* tunables: */ @@ -47,12 +46,13 @@ static int max_concurr = MAX_BAU_CONCURRENT; static int max_concurr_const = MAX_BAU_CONCURRENT; static int plugged_delay = PLUGGED_DELAY; static int plugsb4reset = PLUGSB4RESET; +static int giveup_limit = GIVEUP_LIMIT; static int timeoutsb4reset = TIMEOUTSB4RESET; static int ipi_reset_limit = IPI_RESET_LIMIT; static int complete_threshold = COMPLETE_THRESHOLD; static int congested_respns_us = CONGESTED_RESPONSE_US; static int congested_reps = CONGESTED_REPS; -static int congested_period = CONGESTED_PERIOD; +static int disabled_period = DISABLED_PERIOD; static struct tunables tunables[] = { {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ @@ -63,7 +63,8 @@ static struct tunables tunables[] = { {&complete_threshold, COMPLETE_THRESHOLD}, {&congested_respns_us, CONGESTED_RESPONSE_US}, {&congested_reps, CONGESTED_REPS}, - {&congested_period, CONGESTED_PERIOD} + {&disabled_period, DISABLED_PERIOD}, + {&giveup_limit, GIVEUP_LIMIT} }; static struct dentry *tunables_dir; @@ -120,6 +121,40 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); +static void +set_bau_on(void) +{ + int cpu; + struct bau_control *bcp; + + if (nobau_perm) { + pr_info("BAU not initialized; cannot be turned on\n"); + return; + } + nobau = 0; + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->nobau = 0; + } + pr_info("BAU turned on\n"); + return; +} + +static void +set_bau_off(void) +{ + int cpu; + struct bau_control *bcp; + + nobau = 1; + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->nobau = 1; + } + pr_info("BAU turned off\n"); + return; +} + /* * Determine the first node on a uvhub. 'Nodes' are used for kernel * memory allocation. @@ -278,7 +313,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, * Both sockets dump their completed count total into * the message's count. */ - smaster->socket_acknowledge_count[mdp->msg_slot] = 0; + *sp = 0; asp = (struct atomic_short *)&msg->acknowledge_count; msg_ack_count = atom_asr(socket_ack_count, asp); @@ -398,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) return; } -static inline unsigned long cycles_2_us(unsigned long long cyc) +/* + * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative + * number, not an absolute. It converts a duration in cycles to a duration in + * ns. + */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc) { + struct cyc2ns_data *data = cyc2ns_read_begin(); unsigned long long ns; - unsigned long us; - int cpu = smp_processor_id(); - ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; - us = ns / 1000; - return us; + ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + + cyc2ns_read_end(data); + return ns; +} + +/* + * The reverse of the above; converts a duration in ns to a duration in cycles. + */ +static inline unsigned long long ns_2_cycles(unsigned long long ns) +{ + struct cyc2ns_data *data = cyc2ns_read_begin(); + unsigned long long cyc; + + cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; + + cyc2ns_read_end(data); + return cyc; +} + +static inline unsigned long cycles_2_us(unsigned long long cyc) +{ + return cycles_2_ns(cyc) / NSEC_PER_USEC; +} + +static inline cycles_t sec_2_cycles(unsigned long sec) +{ + return ns_2_cycles(sec * NSEC_PER_SEC); +} + +static inline unsigned long long usec_2_cycles(unsigned long usec) +{ + return ns_2_cycles(usec * NSEC_PER_USEC); } /* @@ -491,16 +560,15 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, } /* - * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. + * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. + * But not currently used. */ static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) { unsigned long descriptor_status; - unsigned long descriptor_status2; - descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); - descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; - descriptor_status = (descriptor_status << 1) | descriptor_status2; + descriptor_status = + ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1; return descriptor_status; } @@ -531,87 +599,11 @@ int normal_busy(struct bau_control *bcp) */ int handle_uv2_busy(struct bau_control *bcp) { - int busy_one = bcp->using_desc; - int normal = bcp->uvhub_cpu; - int selected = -1; - int i; - unsigned long descriptor_status; - unsigned long status; - int mmr_offset; - struct bau_desc *bau_desc_old; - struct bau_desc *bau_desc_new; - struct bau_control *hmaster = bcp->uvhub_master; struct ptc_stats *stat = bcp->statp; - cycles_t ttm; stat->s_uv2_wars++; - spin_lock(&hmaster->uvhub_lock); - /* try for the original first */ - if (busy_one != normal) { - if (!normal_busy(bcp)) - selected = normal; - } - if (selected < 0) { - /* can't use the normal, select an alternate */ - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - descriptor_status = read_lmmr(mmr_offset); - - /* scan available descriptors 32-63 */ - for (i = 0; i < UV_CPUS_PER_AS; i++) { - if ((hmaster->inuse_map & (1 << i)) == 0) { - status = ((descriptor_status >> - (i * UV_ACT_STATUS_SIZE)) & - UV_ACT_STATUS_MASK) << 1; - if (status != UV2H_DESC_BUSY) { - selected = i + UV_CPUS_PER_AS; - break; - } - } - } - } - - if (busy_one != normal) - /* mark the busy alternate as not in-use */ - hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); - - if (selected >= 0) { - /* switch to the selected descriptor */ - if (selected != normal) { - /* set the selected alternate as in-use */ - hmaster->inuse_map |= - (1 << (selected - UV_CPUS_PER_AS)); - if (selected > stat->s_uv2_wars_hw) - stat->s_uv2_wars_hw = selected; - } - bau_desc_old = bcp->descriptor_base; - bau_desc_old += (ITEMS_PER_DESC * busy_one); - bcp->using_desc = selected; - bau_desc_new = bcp->descriptor_base; - bau_desc_new += (ITEMS_PER_DESC * selected); - *bau_desc_new = *bau_desc_old; - } else { - /* - * All are busy. Wait for the normal one for this cpu to - * free up. - */ - stat->s_uv2_war_waits++; - spin_unlock(&hmaster->uvhub_lock); - ttm = get_cycles(); - do { - cpu_relax(); - } while (normal_busy(bcp)); - spin_lock(&hmaster->uvhub_lock); - /* switch to the original descriptor */ - bcp->using_desc = normal; - bau_desc_old = bcp->descriptor_base; - bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); - bcp->using_desc = (ITEMS_PER_DESC * normal); - bau_desc_new = bcp->descriptor_base; - bau_desc_new += (ITEMS_PER_DESC * normal); - *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ - } - spin_unlock(&hmaster->uvhub_lock); - return FLUSH_RETRY_BUSYBUG; + bcp->busy = 1; + return FLUSH_GIVEUP; } static int uv2_wait_completion(struct bau_desc *bau_desc, @@ -620,7 +612,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, { unsigned long descriptor_stat; cycles_t ttm; - int desc = bcp->using_desc; + int desc = bcp->uvhub_cpu; long busy_reps = 0; struct ptc_stats *stat = bcp->statp; @@ -628,24 +620,38 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, /* spin on the status MMR, waiting for it to go idle */ while (descriptor_stat != UV2H_DESC_IDLE) { - /* - * Our software ack messages may be blocked because - * there are no swack resources available. As long - * as none of them has timed out hardware will NACK - * our message and its state will stay IDLE. - */ - if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || - (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { + if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) { + /* + * A h/w bug on the destination side may + * have prevented the message being marked + * pending, thus it doesn't get replied to + * and gets continually nacked until it times + * out with a SOURCE_TIMEOUT. + */ stat->s_stimeout++; return FLUSH_GIVEUP; - } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) { - stat->s_strongnacks++; - bcp->conseccompletes = 0; - return FLUSH_GIVEUP; } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { + ttm = get_cycles(); + + /* + * Our retries may be blocked by all destination + * swack resources being consumed, and a timeout + * pending. In that case hardware returns the + * ERROR that looks like a destination timeout. + * Without using the extended status we have to + * deduce from the short time that this was a + * strong nack. + */ + if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { + bcp->conseccompletes = 0; + stat->s_plugged++; + /* FLUSH_RETRY_PLUGGED causes hang on boot */ + return FLUSH_GIVEUP; + } stat->s_dtimeout++; bcp->conseccompletes = 0; - return FLUSH_RETRY_TIMEOUT; + /* FLUSH_RETRY_TIMEOUT causes hang on boot */ + return FLUSH_GIVEUP; } else { busy_reps++; if (busy_reps > 1000000) { @@ -653,9 +659,8 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, busy_reps = 0; ttm = get_cycles(); if ((ttm - bcp->send_message) > - (bcp->clocks_per_100_usec)) { + bcp->timeout_interval) return handle_uv2_busy(bcp); - } } /* * descriptor_stat is still BUSY @@ -679,7 +684,7 @@ static int wait_completion(struct bau_desc *bau_desc, { int right_shift; unsigned long mmr_offset; - int desc = bcp->using_desc; + int desc = bcp->uvhub_cpu; if (desc < UV_CPUS_PER_AS) { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; @@ -697,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc, bcp, try); } -static inline cycles_t sec_2_cycles(unsigned long sec) -{ - unsigned long ns; - cycles_t cyc; - - ns = sec * 1000000000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Our retries are blocked by all destination sw ack resources being * in use, and a timeout is pending. In that case hardware immediately @@ -758,33 +753,31 @@ static void destination_timeout(struct bau_desc *bau_desc, } /* - * Completions are taking a very long time due to a congested numalink - * network. + * Stop all cpus on a uvhub from using the BAU for a period of time. + * This is reversed by check_enable. */ -static void disable_for_congestion(struct bau_control *bcp, - struct ptc_stats *stat) -{ - /* let only one cpu do this disabling */ - spin_lock(&disable_lock); - - if (!baudisabled && bcp->period_requests && - ((bcp->period_time / bcp->period_requests) > congested_cycles)) { - int tcpu; - struct bau_control *tbcp; - /* it becomes this cpu's job to turn on the use of the - BAU again */ - baudisabled = 1; - bcp->set_bau_off = 1; - bcp->set_bau_on_time = get_cycles(); - bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period); +static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat) +{ + int tcpu; + struct bau_control *tbcp; + struct bau_control *hmaster; + cycles_t tm1; + + hmaster = bcp->uvhub_master; + spin_lock(&hmaster->disable_lock); + if (!bcp->baudisabled) { stat->s_bau_disabled++; + tm1 = get_cycles(); for_each_present_cpu(tcpu) { tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 1; + if (tbcp->uvhub_master == hmaster) { + tbcp->baudisabled = 1; + tbcp->set_bau_on_time = + tm1 + bcp->disabled_period; + } } } - - spin_unlock(&disable_lock); + spin_unlock(&hmaster->disable_lock); } static void count_max_concurr(int stat, struct bau_control *bcp, @@ -815,16 +808,30 @@ static void record_send_stats(cycles_t time1, cycles_t time2, bcp->period_requests++; bcp->period_time += elapsed; if ((elapsed > congested_cycles) && - (bcp->period_requests > bcp->cong_reps)) - disable_for_congestion(bcp, stat); + (bcp->period_requests > bcp->cong_reps) && + ((bcp->period_time / bcp->period_requests) > + congested_cycles)) { + stat->s_congested++; + disable_for_period(bcp, stat); + } } } else stat->s_requestor--; if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; - else if (completion_status == FLUSH_GIVEUP) + else if (completion_status == FLUSH_GIVEUP) { stat->s_giveup++; + if (get_cycles() > bcp->period_end) + bcp->period_giveups = 0; + bcp->period_giveups++; + if (bcp->period_giveups == 1) + bcp->period_end = get_cycles() + bcp->disabled_period; + if (bcp->period_giveups > bcp->giveup_limit) { + disable_for_period(bcp, stat); + stat->s_giveuplimit++; + } + } } /* @@ -868,7 +875,8 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. */ -int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, + struct bau_desc *bau_desc) { int seq_number = 0; int completion_stat = 0; @@ -881,24 +889,23 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) struct bau_control *hmaster = bcp->uvhub_master; struct uv1_bau_msg_header *uv1_hdr = NULL; struct uv2_bau_msg_header *uv2_hdr = NULL; - struct bau_desc *bau_desc; - if (bcp->uvhub_version == 1) + if (bcp->uvhub_version == 1) { + uv1 = 1; uv1_throttle(hmaster, stat); + } while (hmaster->uvhub_quiesce) cpu_relax(); time1 = get_cycles(); + if (uv1) + uv1_hdr = &bau_desc->header.uv1_hdr; + else + uv2_hdr = &bau_desc->header.uv2_hdr; + do { - bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->using_desc); - if (bcp->uvhub_version == 1) { - uv1 = 1; - uv1_hdr = &bau_desc->header.uv1_hdr; - } else - uv2_hdr = &bau_desc->header.uv2_hdr; - if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { + if (try == 0) { if (uv1) uv1_hdr->msg_type = MSG_REGULAR; else @@ -916,25 +923,24 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) uv1_hdr->sequence = seq_number; else uv2_hdr->sequence = seq_number; - index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; + index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); write_mmr_activation(index); try++; completion_stat = wait_completion(bau_desc, bcp, try); - /* UV2: wait_completion() may change the bcp->using_desc */ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; + stat->s_overipilimit++; completion_stat = FLUSH_GIVEUP; break; } cpu_relax(); } while ((completion_stat == FLUSH_RETRY_PLUGGED) || - (completion_stat == FLUSH_RETRY_BUSYBUG) || (completion_stat == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); @@ -955,28 +961,33 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) } /* - * The BAU is disabled. When the disabled time period has expired, the cpu - * that disabled it must re-enable it. - * Return 0 if it is re-enabled for all cpus. + * The BAU is disabled for this uvhub. When the disabled time period has + * expired re-enable it. + * Return 0 if it is re-enabled for all cpus on this uvhub. */ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) { int tcpu; struct bau_control *tbcp; + struct bau_control *hmaster; - if (bcp->set_bau_off) { - if (get_cycles() >= bcp->set_bau_on_time) { - stat->s_bau_reenabled++; - baudisabled = 0; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); + hmaster = bcp->uvhub_master; + spin_lock(&hmaster->disable_lock); + if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) { + stat->s_bau_reenabled++; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + if (tbcp->uvhub_master == hmaster) { tbcp->baudisabled = 0; tbcp->period_requests = 0; tbcp->period_time = 0; + tbcp->period_giveups = 0; } - return 0; } + spin_unlock(&hmaster->disable_lock); + return 0; } + spin_unlock(&hmaster->disable_lock); return -1; } @@ -1047,7 +1058,8 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, * globally purge translation cache of a virtual address or all TLB's * @cpumask: mask of all cpu's in which the address is to be removed * @mm: mm_struct containing virtual address range - * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) + * @start: start virtual address to be removed from TLB + * @end: end virtual address to be remove from TLB * @cpu: the current cpu * * This is the entry point for initiating any UV global TLB shootdown. @@ -1068,8 +1080,8 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, * done. The returned pointer is valid till preemption is re-enabled. */ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, unsigned long va, - unsigned int cpu) + struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned int cpu) { int locals = 0; int remotes = 0; @@ -1078,18 +1090,33 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; + unsigned long descriptor_status; + unsigned long status; + + bcp = &per_cpu(bau_control, cpu); - /* kernel was booted 'nobau' */ - if (nobau) + if (bcp->nobau) return cpumask; - bcp = &per_cpu(bau_control, cpu); stat = bcp->statp; + stat->s_enters++; + + if (bcp->busy) { + descriptor_status = + read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0); + status = ((descriptor_status >> (bcp->uvhub_cpu * + UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1; + if (status == UV2H_DESC_BUSY) + return cpumask; + bcp->busy = 0; + } /* bau was disabled due to slow response */ if (bcp->baudisabled) { - if (check_enable(bcp, stat)) + if (check_enable(bcp, stat)) { + stat->s_ipifordisabled++; return cpumask; + } } /* @@ -1105,38 +1132,43 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, stat->s_ntargself++; bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->using_desc); + bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu); bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) return NULL; record_send_statistics(stat, locals, hubs, remotes, bau_desc); - bau_desc->payload.address = va; + if (!end || (end - start) <= PAGE_SIZE) + bau_desc->payload.address = start; + else + bau_desc->payload.address = TLB_FLUSH_ALL; bau_desc->payload.sending_cpu = cpu; /* * uv_flush_send_and_wait returns 0 if all cpu's were messaged, * or 1 if it gave up and the original cpumask should be returned. */ - if (!uv_flush_send_and_wait(flush_mask, bcp)) + if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc)) return NULL; else return cpumask; } /* - * Search the message queue for any 'other' message with the same software - * acknowledge resource bit vector. + * Search the message queue for any 'other' unprocessed message with the + * same software acknowledge resource bit vector as the 'msg' message. */ struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, - struct bau_control *bcp, unsigned char swack_vec) + struct bau_control *bcp) { struct bau_pq_entry *msg_next = msg + 1; + unsigned char swack_vec = msg->swack_vec; if (msg_next > bcp->queue_last) msg_next = bcp->queue_first; - while ((msg_next->swack_vec != 0) && (msg_next != msg)) { - if (msg_next->swack_vec == swack_vec) + while (msg_next != msg) { + if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) && + (msg_next->swack_vec == swack_vec)) return msg_next; msg_next++; if (msg_next > bcp->queue_last) @@ -1165,32 +1197,30 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) * This message was assigned a swack resource, but no * reserved acknowlegment is pending. * The bug has prevented this message from setting the MMR. - * And no other message has used the same sw_ack resource. - * Do the requested shootdown but do not reply to the msg. - * (the 0 means make no acknowledge) */ - bau_process_message(mdp, bcp, 0); - return; - } - - /* - * Some message has set the MMR 'pending' bit; it might have been - * another message. Look for that message. - */ - other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); - if (other_msg) { - /* There is another. Do not ack the current one. */ - bau_process_message(mdp, bcp, 0); /* - * Let the natural processing of that message acknowledge - * it. Don't get the processing of sw_ack's out of order. + * Some message has set the MMR 'pending' bit; it might have + * been another message. Look for that message. */ - return; + other_msg = find_another_by_swack(msg, bcp); + if (other_msg) { + /* + * There is another. Process this one but do not + * ack it. + */ + bau_process_message(mdp, bcp, 0); + /* + * Let the natural processing of that other message + * acknowledge it. Don't get the processing of sw_ack's + * out of order. + */ + return; + } } /* - * There is no other message using this sw_ack, so it is safe to - * acknowledge it. + * Either the MMR shows this one pending a reply or there is no + * other message using this sw_ack, so it is safe to acknowledge it. */ bau_process_message(mdp, bcp, 1); @@ -1295,8 +1325,8 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { - mmr_image &= ~(1L << UV2_LEG_SHFT); - mmr_image |= (1L << UV2_EXT_SHFT); + /* hw bug workaround; do not use extended status */ + mmr_image &= ~(1L << UV2_EXT_SHFT); } write_mmr_misc_control(pnode, mmr_image); } @@ -1321,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data) { } -static inline unsigned long long usec_2_cycles(unsigned long microsec) -{ - unsigned long ns; - unsigned long long cyc; - - ns = microsec * 1000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - /* * Display the statistics thru /proc/sgi_uv/ptc_statistics * 'data' points to the cpu number @@ -1339,29 +1359,34 @@ static inline unsigned long long usec_2_cycles(unsigned long microsec) static int ptc_seq_show(struct seq_file *file, void *data) { struct ptc_stats *stat; + struct bau_control *bcp; int cpu; cpu = *(loff_t *)data; if (!cpu) { seq_printf(file, - "# cpu sent stime self locals remotes ncpus localhub "); + "# cpu bauoff sent stime self locals remotes ncpus localhub "); seq_printf(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); + seq_printf(file, + "rok resetp resett giveup sto bz throt disable "); seq_printf(file, - "resetp resett giveup sto bz throt swack recv rtime "); + "enable wars warshw warwaits enters ipidis plugged "); seq_printf(file, - "all one mult none retry canc nocan reset rcan "); + "ipiover glim cong swack recv rtime all one mult "); seq_printf(file, - "disable enable wars warshw warwaits\n"); + "none retry canc nocan reset rcan\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { - stat = &per_cpu(ptcstats, cpu); + bcp = &per_cpu(bau_control, cpu); + stat = bcp->statp; /* source side statistics */ seq_printf(file, - "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - cpu, stat->s_requestor, cycles_2_us(stat->s_time), + "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + cpu, bcp->nobau, stat->s_requestor, + cycles_2_us(stat->s_time), stat->s_ntargself, stat->s_ntarglocals, stat->s_ntargremotes, stat->s_ntargcpu, stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, @@ -1375,20 +1400,23 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->s_resets_plug, stat->s_resets_timeout, stat->s_giveup, stat->s_stimeout, stat->s_busy, stat->s_throttles); + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + stat->s_bau_disabled, stat->s_bau_reenabled, + stat->s_uv2_wars, stat->s_uv2_wars_hw, + stat->s_uv2_war_waits, stat->s_enters, + stat->s_ipifordisabled, stat->s_plugged, + stat->s_overipilimit, stat->s_giveuplimit, + stat->s_congested); /* destination side statistics */ seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)), stat->d_requestee, cycles_2_us(stat->d_time), stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); - seq_printf(file, "%ld %ld %ld %ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled, - stat->s_uv2_wars, stat->s_uv2_wars_hw, - stat->s_uv2_war_waits); } return 0; } @@ -1402,13 +1430,14 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf, char *buf; int ret; - buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", - "max_concur plugged_delay plugsb4reset", - "timeoutsb4reset ipi_reset_limit complete_threshold", - "congested_response_us congested_reps congested_period", + buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n", + "max_concur plugged_delay plugsb4reset timeoutsb4reset", + "ipi_reset_limit complete_threshold congested_response_us", + "congested_reps disabled_period giveup_limit", max_concurr, plugged_delay, plugsb4reset, timeoutsb4reset, ipi_reset_limit, complete_threshold, - congested_respns_us, congested_reps, congested_period); + congested_respns_us, congested_reps, disabled_period, + giveup_limit); if (!buf) return -ENOMEM; @@ -1439,13 +1468,21 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user, return -EFAULT; optstr[count - 1] = '\0'; + if (!strcmp(optstr, "on")) { + set_bau_on(); + return count; + } else if (!strcmp(optstr, "off")) { + set_bau_off(); + return count; + } + if (strict_strtol(optstr, 10, &input_arg) < 0) { printk(KERN_DEBUG "%s is invalid\n", optstr); return -EINVAL; } if (input_arg == 0) { - elements = sizeof(stat_description)/sizeof(*stat_description); + elements = ARRAY_SIZE(stat_description); printk(KERN_DEBUG "# cpu: cpu number\n"); printk(KERN_DEBUG "Sender statistics:\n"); for (i = 0; i < elements; i++) @@ -1486,7 +1523,7 @@ static int parse_tunables_write(struct bau_control *bcp, char *instr, char *q; int cnt = 0; int val; - int e = sizeof(tunables) / sizeof(*tunables); + int e = ARRAY_SIZE(tunables); p = instr + strspn(instr, WHITESPACE); q = p; @@ -1571,7 +1608,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user, bcp->complete_threshold = complete_threshold; bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; - bcp->cong_period = congested_period; + bcp->disabled_period = sec_2_cycles(disabled_period); + bcp->giveup_limit = giveup_limit; } return count; } @@ -1700,6 +1738,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) * fairness chaining multilevel count replied_to */ } else { + /* + * BIOS uses legacy mode, but UV2 hardware always + * uses native mode for selective broadcasts. + */ uv2_hdr = &bd2->header.uv2_hdr; uv2_hdr->swack_flag = 1; uv2_hdr->base_dest_nasid = @@ -1812,8 +1854,8 @@ static int calculate_destination_timeout(void) index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; - base = timeout_base_ns[index]; - ts_ns = base * mult1 * mult2; + ts_ns = timeout_base_ns[index]; + ts_ns *= (mult1 * mult2); ret = ts_ns / 1000; } else { /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ @@ -1837,6 +1879,8 @@ static void __init init_per_cpu_tunables(void) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; + if (nobau) + bcp->nobau = 1; bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = usec_2_cycles(2*timeout_us); @@ -1849,10 +1893,11 @@ static void __init init_per_cpu_tunables(void) bcp->complete_threshold = complete_threshold; bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; - bcp->cong_period = congested_period; - bcp->clocks_per_100_usec = usec_2_cycles(100); + bcp->disabled_period = sec_2_cycles(disabled_period); + bcp->giveup_limit = giveup_limit; spin_lock_init(&bcp->queue_lock); spin_lock_init(&bcp->uvhub_lock); + spin_lock_init(&bcp->disable_lock); } } @@ -1973,7 +2018,6 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, } bcp->uvhub_master = *hmasterp; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; - bcp->using_desc = bcp->uvhub_cpu; if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu); @@ -2070,16 +2114,12 @@ static int __init uv_bau_init(void) if (!is_uv_system()) return 0; - if (nobau) - return 0; - for_each_possible_cpu(cur_cpu) { mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); } nuvhubs = uv_num_possible_blades(); - spin_lock_init(&disable_lock); congested_cycles = usec_2_cycles(congested_respns_us); uv_base_pnode = 0x7fffffff; @@ -2092,7 +2132,8 @@ static int __init uv_bau_init(void) enable_timeouts(); if (init_per_cpu(nuvhubs, uv_base_pnode)) { - nobau = 1; + set_bau_off(); + nobau_perm = 1; return 0; } diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index f25c2765a5c..b233681af4d 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -135,6 +135,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; int mmr_pnode, err; + unsigned int dest; BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -143,6 +144,10 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, if (err != 0) return err; + err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest); + if (err != 0) + return err; + if (limit == UV_AFFINITY_CPU) irq_set_status_flags(irq, IRQ_NO_BALANCING); else @@ -159,7 +164,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + entry->dest = dest; mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -222,7 +227,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, if (cfg->move_in_progress) send_cleanup_vector(cfg); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } /* @@ -233,11 +238,9 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, unsigned long mmr_offset, int limit) { - int irq, ret; - - irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); + int ret, irq = irq_alloc_hwirq(uv_blade_to_memory_nid(mmr_blade)); - if (irq <= 0) + if (!irq) return -EBUSY; ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, @@ -245,7 +248,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, if (ret == irq) uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); else - destroy_irq(irq); + irq_free_hwirq(irq); return ret; } @@ -280,6 +283,6 @@ void uv_teardown_irq(unsigned int irq) n = n->rb_right; } spin_unlock_irqrestore(&uv_irq_lock, irqflags); - destroy_irq(irq); + irq_free_hwirq(irq); } EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c new file mode 100644 index 00000000000..c89c93320c1 --- /dev/null +++ b/arch/x86/platform/uv/uv_nmi.c @@ -0,0 +1,727 @@ +/* + * SGI NMI support routines + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2009-2013 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Mike Travis + */ + +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/kdb.h> +#include <linux/kexec.h> +#include <linux/kgdb.h> +#include <linux/module.h> +#include <linux/nmi.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/apic.h> +#include <asm/current.h> +#include <asm/kdebug.h> +#include <asm/local64.h> +#include <asm/nmi.h> +#include <asm/traps.h> +#include <asm/uv/uv.h> +#include <asm/uv/uv_hub.h> +#include <asm/uv/uv_mmrs.h> + +/* + * UV handler for NMI + * + * Handle system-wide NMI events generated by the global 'power nmi' command. + * + * Basic operation is to field the NMI interrupt on each cpu and wait + * until all cpus have arrived into the nmi handler. If some cpus do not + * make it into the handler, try and force them in with the IPI(NMI) signal. + * + * We also have to lessen UV Hub MMR accesses as much as possible as this + * disrupts the UV Hub's primary mission of directing NumaLink traffic and + * can cause system problems to occur. + * + * To do this we register our primary NMI notifier on the NMI_UNKNOWN + * chain. This reduces the number of false NMI calls when the perf + * tools are running which generate an enormous number of NMIs per + * second (~4M/s for 1024 cpu threads). Our secondary NMI handler is + * very short as it only checks that if it has been "pinged" with the + * IPI(NMI) signal as mentioned above, and does not read the UV Hub's MMR. + * + */ + +static struct uv_hub_nmi_s **uv_hub_nmi_list; + +DEFINE_PER_CPU(struct uv_cpu_nmi_s, __uv_cpu_nmi); +EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_nmi); + +static unsigned long nmi_mmr; +static unsigned long nmi_mmr_clear; +static unsigned long nmi_mmr_pending; + +static atomic_t uv_in_nmi; +static atomic_t uv_nmi_cpu = ATOMIC_INIT(-1); +static atomic_t uv_nmi_cpus_in_nmi = ATOMIC_INIT(-1); +static atomic_t uv_nmi_slave_continue; +static cpumask_var_t uv_nmi_cpu_mask; + +/* Values for uv_nmi_slave_continue */ +#define SLAVE_CLEAR 0 +#define SLAVE_CONTINUE 1 +#define SLAVE_EXIT 2 + +/* + * Default is all stack dumps go to the console and buffer. + * Lower level to send to log buffer only. + */ +static int uv_nmi_loglevel = CONSOLE_LOGLEVEL_DEFAULT; +module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644); + +/* + * The following values show statistics on how perf events are affecting + * this system. + */ +static int param_get_local64(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%lu\n", local64_read((local64_t *)kp->arg)); +} + +static int param_set_local64(const char *val, const struct kernel_param *kp) +{ + /* clear on any write */ + local64_set((local64_t *)kp->arg, 0); + return 0; +} + +static struct kernel_param_ops param_ops_local64 = { + .get = param_get_local64, + .set = param_set_local64, +}; +#define param_check_local64(name, p) __param_check(name, p, local64_t) + +static local64_t uv_nmi_count; +module_param_named(nmi_count, uv_nmi_count, local64, 0644); + +static local64_t uv_nmi_misses; +module_param_named(nmi_misses, uv_nmi_misses, local64, 0644); + +static local64_t uv_nmi_ping_count; +module_param_named(ping_count, uv_nmi_ping_count, local64, 0644); + +static local64_t uv_nmi_ping_misses; +module_param_named(ping_misses, uv_nmi_ping_misses, local64, 0644); + +/* + * Following values allow tuning for large systems under heavy loading + */ +static int uv_nmi_initial_delay = 100; +module_param_named(initial_delay, uv_nmi_initial_delay, int, 0644); + +static int uv_nmi_slave_delay = 100; +module_param_named(slave_delay, uv_nmi_slave_delay, int, 0644); + +static int uv_nmi_loop_delay = 100; +module_param_named(loop_delay, uv_nmi_loop_delay, int, 0644); + +static int uv_nmi_trigger_delay = 10000; +module_param_named(trigger_delay, uv_nmi_trigger_delay, int, 0644); + +static int uv_nmi_wait_count = 100; +module_param_named(wait_count, uv_nmi_wait_count, int, 0644); + +static int uv_nmi_retry_count = 500; +module_param_named(retry_count, uv_nmi_retry_count, int, 0644); + +/* + * Valid NMI Actions: + * "dump" - dump process stack for each cpu + * "ips" - dump IP info for each cpu + * "kdump" - do crash dump + * "kdb" - enter KDB (default) + * "kgdb" - enter KGDB + */ +static char uv_nmi_action[8] = "kdb"; +module_param_string(action, uv_nmi_action, sizeof(uv_nmi_action), 0644); + +static inline bool uv_nmi_action_is(const char *action) +{ + return (strncmp(uv_nmi_action, action, strlen(action)) == 0); +} + +/* Setup which NMI support is present in system */ +static void uv_nmi_setup_mmrs(void) +{ + if (uv_read_local_mmr(UVH_NMI_MMRX_SUPPORTED)) { + uv_write_local_mmr(UVH_NMI_MMRX_REQ, + 1UL << UVH_NMI_MMRX_REQ_SHIFT); + nmi_mmr = UVH_NMI_MMRX; + nmi_mmr_clear = UVH_NMI_MMRX_CLEAR; + nmi_mmr_pending = 1UL << UVH_NMI_MMRX_SHIFT; + pr_info("UV: SMI NMI support: %s\n", UVH_NMI_MMRX_TYPE); + } else { + nmi_mmr = UVH_NMI_MMR; + nmi_mmr_clear = UVH_NMI_MMR_CLEAR; + nmi_mmr_pending = 1UL << UVH_NMI_MMR_SHIFT; + pr_info("UV: SMI NMI support: %s\n", UVH_NMI_MMR_TYPE); + } +} + +/* Read NMI MMR and check if NMI flag was set by BMC. */ +static inline int uv_nmi_test_mmr(struct uv_hub_nmi_s *hub_nmi) +{ + hub_nmi->nmi_value = uv_read_local_mmr(nmi_mmr); + atomic_inc(&hub_nmi->read_mmr_count); + return !!(hub_nmi->nmi_value & nmi_mmr_pending); +} + +static inline void uv_local_mmr_clear_nmi(void) +{ + uv_write_local_mmr(nmi_mmr_clear, nmi_mmr_pending); +} + +/* + * If first cpu in on this hub, set hub_nmi "in_nmi" and "owner" values and + * return true. If first cpu in on the system, set global "in_nmi" flag. + */ +static int uv_set_in_nmi(int cpu, struct uv_hub_nmi_s *hub_nmi) +{ + int first = atomic_add_unless(&hub_nmi->in_nmi, 1, 1); + + if (first) { + atomic_set(&hub_nmi->cpu_owner, cpu); + if (atomic_add_unless(&uv_in_nmi, 1, 1)) + atomic_set(&uv_nmi_cpu, cpu); + + atomic_inc(&hub_nmi->nmi_count); + } + return first; +} + +/* Check if this is a system NMI event */ +static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi) +{ + int cpu = smp_processor_id(); + int nmi = 0; + + local64_inc(&uv_nmi_count); + uv_cpu_nmi.queries++; + + do { + nmi = atomic_read(&hub_nmi->in_nmi); + if (nmi) + break; + + if (raw_spin_trylock(&hub_nmi->nmi_lock)) { + + /* check hub MMR NMI flag */ + if (uv_nmi_test_mmr(hub_nmi)) { + uv_set_in_nmi(cpu, hub_nmi); + nmi = 1; + break; + } + + /* MMR NMI flag is clear */ + raw_spin_unlock(&hub_nmi->nmi_lock); + + } else { + /* wait a moment for the hub nmi locker to set flag */ + cpu_relax(); + udelay(uv_nmi_slave_delay); + + /* re-check hub in_nmi flag */ + nmi = atomic_read(&hub_nmi->in_nmi); + if (nmi) + break; + } + + /* check if this BMC missed setting the MMR NMI flag */ + if (!nmi) { + nmi = atomic_read(&uv_in_nmi); + if (nmi) + uv_set_in_nmi(cpu, hub_nmi); + } + + } while (0); + + if (!nmi) + local64_inc(&uv_nmi_misses); + + return nmi; +} + +/* Need to reset the NMI MMR register, but only once per hub. */ +static inline void uv_clear_nmi(int cpu) +{ + struct uv_hub_nmi_s *hub_nmi = uv_hub_nmi; + + if (cpu == atomic_read(&hub_nmi->cpu_owner)) { + atomic_set(&hub_nmi->cpu_owner, -1); + atomic_set(&hub_nmi->in_nmi, 0); + uv_local_mmr_clear_nmi(); + raw_spin_unlock(&hub_nmi->nmi_lock); + } +} + +/* Print non-responding cpus */ +static void uv_nmi_nr_cpus_pr(char *fmt) +{ + static char cpu_list[1024]; + int len = sizeof(cpu_list); + int c = cpumask_weight(uv_nmi_cpu_mask); + int n = cpulist_scnprintf(cpu_list, len, uv_nmi_cpu_mask); + + if (n >= len-1) + strcpy(&cpu_list[len - 6], "...\n"); + + printk(fmt, c, cpu_list); +} + +/* Ping non-responding cpus attemping to force them into the NMI handler */ +static void uv_nmi_nr_cpus_ping(void) +{ + int cpu; + + for_each_cpu(cpu, uv_nmi_cpu_mask) + atomic_set(&uv_cpu_nmi_per(cpu).pinging, 1); + + apic->send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI); +} + +/* Clean up flags for cpus that ignored both NMI and ping */ +static void uv_nmi_cleanup_mask(void) +{ + int cpu; + + for_each_cpu(cpu, uv_nmi_cpu_mask) { + atomic_set(&uv_cpu_nmi_per(cpu).pinging, 0); + atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_OUT); + cpumask_clear_cpu(cpu, uv_nmi_cpu_mask); + } +} + +/* Loop waiting as cpus enter nmi handler */ +static int uv_nmi_wait_cpus(int first) +{ + int i, j, k, n = num_online_cpus(); + int last_k = 0, waiting = 0; + + if (first) { + cpumask_copy(uv_nmi_cpu_mask, cpu_online_mask); + k = 0; + } else { + k = n - cpumask_weight(uv_nmi_cpu_mask); + } + + udelay(uv_nmi_initial_delay); + for (i = 0; i < uv_nmi_retry_count; i++) { + int loop_delay = uv_nmi_loop_delay; + + for_each_cpu(j, uv_nmi_cpu_mask) { + if (atomic_read(&uv_cpu_nmi_per(j).state)) { + cpumask_clear_cpu(j, uv_nmi_cpu_mask); + if (++k >= n) + break; + } + } + if (k >= n) { /* all in? */ + k = n; + break; + } + if (last_k != k) { /* abort if no new cpus coming in */ + last_k = k; + waiting = 0; + } else if (++waiting > uv_nmi_wait_count) + break; + + /* extend delay if waiting only for cpu 0 */ + if (waiting && (n - k) == 1 && + cpumask_test_cpu(0, uv_nmi_cpu_mask)) + loop_delay *= 100; + + udelay(loop_delay); + } + atomic_set(&uv_nmi_cpus_in_nmi, k); + return n - k; +} + +/* Wait until all slave cpus have entered UV NMI handler */ +static void uv_nmi_wait(int master) +{ + /* indicate this cpu is in */ + atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_IN); + + /* if not the first cpu in (the master), then we are a slave cpu */ + if (!master) + return; + + do { + /* wait for all other cpus to gather here */ + if (!uv_nmi_wait_cpus(1)) + break; + + /* if not all made it in, send IPI NMI to them */ + uv_nmi_nr_cpus_pr(KERN_ALERT + "UV: Sending NMI IPI to %d non-responding CPUs: %s\n"); + uv_nmi_nr_cpus_ping(); + + /* if all cpus are in, then done */ + if (!uv_nmi_wait_cpus(0)) + break; + + uv_nmi_nr_cpus_pr(KERN_ALERT + "UV: %d CPUs not in NMI loop: %s\n"); + } while (0); + + pr_alert("UV: %d of %d CPUs in NMI\n", + atomic_read(&uv_nmi_cpus_in_nmi), num_online_cpus()); +} + +static void uv_nmi_dump_cpu_ip_hdr(void) +{ + printk(KERN_DEFAULT + "\nUV: %4s %6s %-32s %s (Note: PID 0 not listed)\n", + "CPU", "PID", "COMMAND", "IP"); +} + +static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs) +{ + printk(KERN_DEFAULT "UV: %4d %6d %-32.32s ", + cpu, current->pid, current->comm); + + printk_address(regs->ip); +} + +/* Dump this cpu's state */ +static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) +{ + const char *dots = " ................................. "; + + if (uv_nmi_action_is("ips")) { + if (cpu == 0) + uv_nmi_dump_cpu_ip_hdr(); + + if (current->pid != 0) + uv_nmi_dump_cpu_ip(cpu, regs); + + } else if (uv_nmi_action_is("dump")) { + printk(KERN_DEFAULT + "UV:%sNMI process trace for CPU %d\n", dots, cpu); + show_regs(regs); + } + atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE); +} + +/* Trigger a slave cpu to dump it's state */ +static void uv_nmi_trigger_dump(int cpu) +{ + int retry = uv_nmi_trigger_delay; + + if (atomic_read(&uv_cpu_nmi_per(cpu).state) != UV_NMI_STATE_IN) + return; + + atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_DUMP); + do { + cpu_relax(); + udelay(10); + if (atomic_read(&uv_cpu_nmi_per(cpu).state) + != UV_NMI_STATE_DUMP) + return; + } while (--retry > 0); + + pr_crit("UV: CPU %d stuck in process dump function\n", cpu); + atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_DUMP_DONE); +} + +/* Wait until all cpus ready to exit */ +static void uv_nmi_sync_exit(int master) +{ + atomic_dec(&uv_nmi_cpus_in_nmi); + if (master) { + while (atomic_read(&uv_nmi_cpus_in_nmi) > 0) + cpu_relax(); + atomic_set(&uv_nmi_slave_continue, SLAVE_CLEAR); + } else { + while (atomic_read(&uv_nmi_slave_continue)) + cpu_relax(); + } +} + +/* Walk through cpu list and dump state of each */ +static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) +{ + if (master) { + int tcpu; + int ignored = 0; + int saved_console_loglevel = console_loglevel; + + pr_alert("UV: tracing %s for %d CPUs from CPU %d\n", + uv_nmi_action_is("ips") ? "IPs" : "processes", + atomic_read(&uv_nmi_cpus_in_nmi), cpu); + + console_loglevel = uv_nmi_loglevel; + atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT); + for_each_online_cpu(tcpu) { + if (cpumask_test_cpu(tcpu, uv_nmi_cpu_mask)) + ignored++; + else if (tcpu == cpu) + uv_nmi_dump_state_cpu(tcpu, regs); + else + uv_nmi_trigger_dump(tcpu); + } + if (ignored) + printk(KERN_DEFAULT "UV: %d CPUs ignored NMI\n", + ignored); + + console_loglevel = saved_console_loglevel; + pr_alert("UV: process trace complete\n"); + } else { + while (!atomic_read(&uv_nmi_slave_continue)) + cpu_relax(); + while (atomic_read(&uv_cpu_nmi.state) != UV_NMI_STATE_DUMP) + cpu_relax(); + uv_nmi_dump_state_cpu(cpu, regs); + } + uv_nmi_sync_exit(master); +} + +static void uv_nmi_touch_watchdogs(void) +{ + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); + touch_nmi_watchdog(); +} + +#if defined(CONFIG_KEXEC) +static atomic_t uv_nmi_kexec_failed; +static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) +{ + /* Call crash to dump system state */ + if (master) { + pr_emerg("UV: NMI executing crash_kexec on CPU%d\n", cpu); + crash_kexec(regs); + + pr_emerg("UV: crash_kexec unexpectedly returned, "); + if (!kexec_crash_image) { + pr_cont("crash kernel not loaded\n"); + atomic_set(&uv_nmi_kexec_failed, 1); + uv_nmi_sync_exit(1); + return; + } + pr_cont("kexec busy, stalling cpus while waiting\n"); + } + + /* If crash exec fails the slaves should return, otherwise stall */ + while (atomic_read(&uv_nmi_kexec_failed) == 0) + mdelay(10); + + /* Crash kernel most likely not loaded, return in an orderly fashion */ + uv_nmi_sync_exit(0); +} + +#else /* !CONFIG_KEXEC */ +static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) +{ + if (master) + pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n"); +} +#endif /* !CONFIG_KEXEC */ + +#ifdef CONFIG_KGDB +#ifdef CONFIG_KGDB_KDB +static inline int uv_nmi_kdb_reason(void) +{ + return KDB_REASON_SYSTEM_NMI; +} +#else /* !CONFIG_KGDB_KDB */ +static inline int uv_nmi_kdb_reason(void) +{ + /* Insure user is expecting to attach gdb remote */ + if (uv_nmi_action_is("kgdb")) + return 0; + + pr_err("UV: NMI error: KDB is not enabled in this kernel\n"); + return -1; +} +#endif /* CONFIG_KGDB_KDB */ + +/* + * Call KGDB/KDB from NMI handler + * + * Note that if both KGDB and KDB are configured, then the action of 'kgdb' or + * 'kdb' has no affect on which is used. See the KGDB documention for further + * information. + */ +static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) +{ + if (master) { + int reason = uv_nmi_kdb_reason(); + int ret; + + if (reason < 0) + return; + + /* call KGDB NMI handler as MASTER */ + ret = kgdb_nmicallin(cpu, X86_TRAP_NMI, regs, reason, + &uv_nmi_slave_continue); + if (ret) { + pr_alert("KGDB returned error, is kgdboc set?\n"); + atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT); + } + } else { + /* wait for KGDB signal that it's ready for slaves to enter */ + int sig; + + do { + cpu_relax(); + sig = atomic_read(&uv_nmi_slave_continue); + } while (!sig); + + /* call KGDB as slave */ + if (sig == SLAVE_CONTINUE) + kgdb_nmicallback(cpu, regs); + } + uv_nmi_sync_exit(master); +} + +#else /* !CONFIG_KGDB */ +static inline void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) +{ + pr_err("UV: NMI error: KGDB is not enabled in this kernel\n"); +} +#endif /* !CONFIG_KGDB */ + +/* + * UV NMI handler + */ +int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) +{ + struct uv_hub_nmi_s *hub_nmi = uv_hub_nmi; + int cpu = smp_processor_id(); + int master = 0; + unsigned long flags; + + local_irq_save(flags); + + /* If not a UV System NMI, ignore */ + if (!atomic_read(&uv_cpu_nmi.pinging) && !uv_check_nmi(hub_nmi)) { + local_irq_restore(flags); + return NMI_DONE; + } + + /* Indicate we are the first CPU into the NMI handler */ + master = (atomic_read(&uv_nmi_cpu) == cpu); + + /* If NMI action is "kdump", then attempt to do it */ + if (uv_nmi_action_is("kdump")) + uv_nmi_kdump(cpu, master, regs); + + /* Pause as all cpus enter the NMI handler */ + uv_nmi_wait(master); + + /* Dump state of each cpu */ + if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) + uv_nmi_dump_state(cpu, regs, master); + + /* Call KGDB/KDB if enabled */ + else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) + uv_call_kgdb_kdb(cpu, regs, master); + + /* Clear per_cpu "in nmi" flag */ + atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_OUT); + + /* Clear MMR NMI flag on each hub */ + uv_clear_nmi(cpu); + + /* Clear global flags */ + if (master) { + if (cpumask_weight(uv_nmi_cpu_mask)) + uv_nmi_cleanup_mask(); + atomic_set(&uv_nmi_cpus_in_nmi, -1); + atomic_set(&uv_nmi_cpu, -1); + atomic_set(&uv_in_nmi, 0); + } + + uv_nmi_touch_watchdogs(); + local_irq_restore(flags); + + return NMI_HANDLED; +} + +/* + * NMI handler for pulling in CPUs when perf events are grabbing our NMI + */ +static int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs) +{ + int ret; + + uv_cpu_nmi.queries++; + if (!atomic_read(&uv_cpu_nmi.pinging)) { + local64_inc(&uv_nmi_ping_misses); + return NMI_DONE; + } + + uv_cpu_nmi.pings++; + local64_inc(&uv_nmi_ping_count); + ret = uv_handle_nmi(reason, regs); + atomic_set(&uv_cpu_nmi.pinging, 0); + return ret; +} + +static void uv_register_nmi_notifier(void) +{ + if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv")) + pr_warn("UV: NMI handler failed to register\n"); + + if (register_nmi_handler(NMI_LOCAL, uv_handle_nmi_ping, 0, "uvping")) + pr_warn("UV: PING NMI handler failed to register\n"); +} + +void uv_nmi_init(void) +{ + unsigned int value; + + /* + * Unmask NMI on all cpus + */ + value = apic_read(APIC_LVT1) | APIC_DM_NMI; + value &= ~APIC_LVT_MASKED; + apic_write(APIC_LVT1, value); +} + +void uv_nmi_setup(void) +{ + int size = sizeof(void *) * (1 << NODES_SHIFT); + int cpu, nid; + + /* Setup hub nmi info */ + uv_nmi_setup_mmrs(); + uv_hub_nmi_list = kzalloc(size, GFP_KERNEL); + pr_info("UV: NMI hub list @ 0x%p (%d)\n", uv_hub_nmi_list, size); + BUG_ON(!uv_hub_nmi_list); + size = sizeof(struct uv_hub_nmi_s); + for_each_present_cpu(cpu) { + nid = cpu_to_node(cpu); + if (uv_hub_nmi_list[nid] == NULL) { + uv_hub_nmi_list[nid] = kzalloc_node(size, + GFP_KERNEL, nid); + BUG_ON(!uv_hub_nmi_list[nid]); + raw_spin_lock_init(&(uv_hub_nmi_list[nid]->nmi_lock)); + atomic_set(&uv_hub_nmi_list[nid]->cpu_owner, -1); + } + uv_hub_nmi_per(cpu) = uv_hub_nmi_list[nid]; + } + BUG_ON(!alloc_cpumask_var(&uv_nmi_cpu_mask, GFP_KERNEL)); + uv_register_nmi_notifier(); +} diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 9f29a01ee1b..5c86786bbfd 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -15,7 +15,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009-2013 Silicon Graphics, Inc. All Rights Reserved. * Copyright (c) Dimitri Sivanich */ #include <linux/clockchips.h> @@ -37,7 +37,7 @@ static void uv_rtc_timer_setup(enum clock_event_mode, static struct clocksource clocksource_uv = { .name = RTC_NAME, - .rating = 400, + .rating = 299, .read = uv_read_rtc, .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, .flags = CLOCK_SOURCE_IS_CONTINUOUS, @@ -102,9 +102,10 @@ static int uv_intr_pending(int pnode) if (is_uv1_hub()) return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & UV1H_EVENT_OCCURRED0_RTC1_MASK; - else - return uv_read_global_mmr64(pnode, UV2H_EVENT_OCCURRED2) & - UV2H_EVENT_OCCURRED2_RTC_1_MASK; + else if (is_uvx_hub()) + return uv_read_global_mmr64(pnode, UVXH_EVENT_OCCURRED2) & + UVXH_EVENT_OCCURRED2_RTC_1_MASK; + return 0; } /* Setup interrupt and return non-zero if early expiration occurred. */ @@ -122,8 +123,8 @@ static int uv_setup_intr(int cpu, u64 expires) uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, UV1H_EVENT_OCCURRED0_RTC1_MASK); else - uv_write_global_mmr64(pnode, UV2H_EVENT_OCCURRED2_ALIAS, - UV2H_EVENT_OCCURRED2_RTC_1_MASK); + uv_write_global_mmr64(pnode, UVXH_EVENT_OCCURRED2_ALIAS, + UVXH_EVENT_OCCURRED2_RTC_1_MASK); val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); @@ -158,10 +159,9 @@ static __init int uv_rtc_allocate_timers(void) { int cpu; - blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); + blade_info = kzalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); if (!blade_info) return -ENOMEM; - memset(blade_info, 0, uv_possible_blades * sizeof(void *)); for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); @@ -379,10 +379,6 @@ static __init int uv_rtc_setup_clock(void) if (!is_uv_system()) return -ENODEV; - /* If single blade, prefer tsc */ - if (uv_num_possible_blades() == 1) - clocksource_uv.rating = 250; - rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second); if (rc) printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); |
