diff options
author | Cliff Wickman <cpw@sgi.com> | 2010-04-14 11:35:46 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2010-04-14 18:49:53 +0200 |
commit | b8f7fb13d2d7ff14818fd1d3edd8b834d38b0217 (patch) | |
tree | 48844c12cc443690116abbec7e836f8c08360d56 /arch/x86/kernel/tlb_uv.c | |
parent | 2acebe9ecb2b77876e87a1480729cfb2db4570dd (diff) |
x86, UV: Improve BAU performance and error recovery
- increase performance of the interrupt handler
- release timed-out software acknowledge resources
- recover from continuous-busy status due to a hardware issue
- add a 'throttle' to keep a uvhub from sending more than a
specified number of broadcasts concurrently (work around the hardware issue)
- provide a 'nobau' boot command line option
- rename 'pnode' and 'node' to 'uvhub' (the 'node' terminology
is ambiguous)
- add some new statistics about the scope of broadcasts, retries, the
hardware issue and the 'throttle'
- split off new function uv_bau_retry_msg() from
uv_bau_process_message() per community coding style feedback.
- simplify the argument list to uv_bau_process_message(), per
community coding style feedback.
Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: linux-mm@kvack.org
Cc: Jack Steiner <steiner@sgi.com>
Cc: Russ Anderson <rja@sgi.com>
Cc: Mike Travis <travis@sgi.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <E1O25Z4-0004Ur-PB@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/kernel/tlb_uv.c')
-rw-r--r-- | arch/x86/kernel/tlb_uv.c | 1270 |
1 files changed, 900 insertions, 370 deletions
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ef68ba48564..414f7c4fe76 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1,7 +1,7 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI. + * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI. * * This code is released under the GNU General Public License version 2 or * later. @@ -19,44 +19,67 @@ #include <asm/idle.h> #include <asm/tsc.h> #include <asm/irq_vectors.h> +#include <asm/timer.h> + +struct msg_desc { + struct bau_payload_queue_entry *msg; + int msg_slot; + int sw_ack_slot; + struct bau_payload_queue_entry *va_queue_first; + struct bau_payload_queue_entry *va_queue_last; +}; #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL -static struct bau_control **uv_bau_table_bases __read_mostly; -static int uv_bau_retry_limit __read_mostly; +static int uv_bau_max_concurrent __read_mostly; -/* base pnode in this partition */ -static int uv_partition_base_pnode __read_mostly; +static int nobau; +static int __init setup_nobau(char *arg) +{ + nobau = 1; + return 0; +} +early_param("nobau", setup_nobau); -static unsigned long uv_mmask __read_mostly; +/* base pnode in this partition */ +static int uv_partition_base_pnode __read_mostly; +/* position of pnode (which is nasid>>1): */ +static int uv_nshift __read_mostly; +static unsigned long uv_mmask __read_mostly; static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); +static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); + +struct reset_args { + int sender; +}; /* - * Determine the first node on a blade. + * Determine the first node on a uvhub. 'Nodes' are used for kernel + * memory allocation. */ -static int __init blade_to_first_node(int blade) +static int __init uvhub_to_first_node(int uvhub) { int node, b; for_each_online_node(node) { b = uv_node_to_blade_id(node); - if (blade == b) + if (uvhub == b) return node; } - return -1; /* shouldn't happen */ + return -1; } /* - * Determine the apicid of the first cpu on a blade. + * Determine the apicid of the first cpu on a uvhub. */ -static int __init blade_to_first_apicid(int blade) +static int __init uvhub_to_first_apicid(int uvhub) { int cpu; for_each_present_cpu(cpu) - if (blade == uv_cpu_to_blade_id(cpu)) + if (uvhub == uv_cpu_to_blade_id(cpu)) return per_cpu(x86_cpu_to_apicid, cpu); return -1; } @@ -69,195 +92,459 @@ static int __init blade_to_first_apicid(int blade) * clear of the Timeout bit (as well) will free the resource. No reply will * be sent (the hardware will only do one reply per message). */ -static void uv_reply_to_message(int resource, - struct bau_payload_queue_entry *msg, - struct bau_msg_status *msp) +static inline void uv_reply_to_message(struct msg_desc *mdp, + struct bau_control *bcp) { unsigned long dw; + struct bau_payload_queue_entry *msg; - dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); + msg = mdp->msg; + if (!msg->canceled) { + dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) | + msg->sw_ack_vector; + uv_write_local_mmr( + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); + } msg->replied_to = 1; msg->sw_ack_vector = 0; - if (msp) - msp->seen_by.bits = 0; - uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); } /* - * Do all the things a cpu should do for a TLB shootdown message. - * Other cpu's may come here at the same time for this message. + * Process the receipt of a RETRY message */ -static void uv_bau_process_message(struct bau_payload_queue_entry *msg, - int msg_slot, int sw_ack_slot) +static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, + struct bau_control *bcp) { - unsigned long this_cpu_mask; - struct bau_msg_status *msp; - int cpu; + int i; + int cancel_count = 0; + int slot2; + unsigned long msg_res; + unsigned long mmr = 0; + struct bau_payload_queue_entry *msg; + struct bau_payload_queue_entry *msg2; + struct ptc_stats *stat; - msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; - cpu = uv_blade_processor_id(); - msg->number_of_cpus = - uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); - this_cpu_mask = 1UL << cpu; - if (msp->seen_by.bits & this_cpu_mask) - return; - atomic_or_long(&msp->seen_by.bits, this_cpu_mask); + msg = mdp->msg; + stat = &per_cpu(ptcstats, bcp->cpu); + stat->d_retries++; + /* + * cancel any message from msg+1 to the retry itself + */ + for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { + if (msg2 > mdp->va_queue_last) + msg2 = mdp->va_queue_first; + if (msg2 == msg) + break; + + /* same conditions for cancellation as uv_do_reset */ + if ((msg2->replied_to == 0) && (msg2->canceled == 0) && + (msg2->sw_ack_vector) && ((msg2->sw_ack_vector & + msg->sw_ack_vector) == 0) && + (msg2->sending_cpu == msg->sending_cpu) && + (msg2->msg_type != MSG_NOOP)) { + slot2 = msg2 - mdp->va_queue_first; + mmr = uv_read_local_mmr + (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); + msg_res = ((msg2->sw_ack_vector << 8) | + msg2->sw_ack_vector); + /* + * This is a message retry; clear the resources held + * by the previous message only if they timed out. + * If it has not timed out we have an unexpected + * situation to report. + */ + if (mmr & (msg_res << 8)) { + /* + * is the resource timed out? + * make everyone ignore the cancelled message. + */ + msg2->canceled = 1; + stat->d_canceled++; + cancel_count++; + uv_write_local_mmr( + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, + (msg_res << 8) | msg_res); + } else + printk(KERN_INFO "note bau retry: no effect\n"); + } + } + if (!cancel_count) + stat->d_nocanceled++; +} - if (msg->replied_to == 1) - return; +/* + * Do all the things a cpu should do for a TLB shootdown message. + * Other cpu's may come here at the same time for this message. + */ +static void uv_bau_process_message(struct msg_desc *mdp, + struct bau_control *bcp) +{ + int msg_ack_count; + short socket_ack_count = 0; + struct ptc_stats *stat; + struct bau_payload_queue_entry *msg; + struct bau_control *smaster = bcp->socket_master; + /* + * This must be a normal message, or retry of a normal message + */ + msg = mdp->msg; + stat = &per_cpu(ptcstats, bcp->cpu); if (msg->address == TLB_FLUSH_ALL) { local_flush_tlb(); - __get_cpu_var(ptcstats).alltlb++; + stat->d_alltlb++; } else { __flush_tlb_one(msg->address); - __get_cpu_var(ptcstats).onetlb++; + stat->d_onetlb++; } + stat->d_requestee++; + + /* + * One cpu on each uvhub has the additional job on a RETRY + * of releasing the resource held by the message that is + * being retried. That message is identified by sending + * cpu number. + */ + if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) + uv_bau_process_retry_msg(mdp, bcp); - __get_cpu_var(ptcstats).requestee++; + /* + * This is a sw_ack message, so we have to reply to it. + * Count each responding cpu on the socket. This avoids + * pinging the count's cache line back and forth between + * the sockets. + */ + socket_ack_count = atomic_add_short_return(1, (struct atomic_short *) + &smaster->socket_acknowledge_count[mdp->msg_slot]); + if (socket_ack_count == bcp->cpus_in_socket) { + /* + * Both sockets dump their completed count total into + * the message's count. + */ + smaster->socket_acknowledge_count[mdp->msg_slot] = 0; + msg_ack_count = atomic_add_short_return(socket_ack_count, + (struct atomic_short *)&msg->acknowledge_count); + + if (msg_ack_count == bcp->cpus_in_uvhub) { + /* + * All cpus in uvhub saw it; reply + */ + uv_reply_to_message(mdp, bcp); + } + } - atomic_inc_short(&msg->acknowledge_count); - if (msg->number_of_cpus == msg->acknowledge_count) - uv_reply_to_message(sw_ack_slot, msg, msp); + return; } /* - * Examine the payload queue on one distribution node to see - * which messages have not been seen, and which cpu(s) have not seen them. + * Determine the first cpu on a uvhub. + */ +static int uvhub_to_first_cpu(int uvhub) +{ + int cpu; + for_each_present_cpu(cpu) + if (uvhub == uv_cpu_to_blade_id(cpu)) + return cpu; + return -1; +} + +/* + * Last resort when we get a large number of destination timeouts is + * to clear resources held by a given cpu. + * Do this with IPI so that all messages in the BAU message queue + * can be identified by their nonzero sw_ack_vector field. * - * Returns the number of cpu's that have not responded. + * This is entered for a single cpu on the uvhub. + * The sender want's this uvhub to free a specific message's + * sw_ack resources. */ -static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) +static void +uv_do_reset(void *ptr) { - struct bau_payload_queue_entry *msg; - struct bau_msg_status *msp; - int count = 0; int i; - int j; + int slot; + int count = 0; + unsigned long mmr; + unsigned long msg_res; + struct bau_control *bcp; + struct reset_args *rap; + struct bau_payload_queue_entry *msg; + struct ptc_stats *stat; - for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; - msg++, i++) { - if ((msg->sending_cpu == sender) && (!msg->replied_to)) { - msp = bau_tablesp->msg_statuses + i; - printk(KERN_DEBUG - "blade %d: address:%#lx %d of %d, not cpu(s): ", - i, msg->address, msg->acknowledge_count, - msg->number_of_cpus); - for (j = 0; j < msg->number_of_cpus; j++) { - if (!((1L << j) & msp->seen_by.bits)) { - count++; - printk("%d ", j); - } + bcp = &per_cpu(bau_control, smp_processor_id()); + rap = (struct reset_args *)ptr; + stat = &per_cpu(ptcstats, bcp->cpu); + stat->d_resets++; + + /* + * We're looking for the given sender, and + * will free its sw_ack resource. + * If all cpu's finally responded after the timeout, its + * message 'replied_to' was set. + */ + for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { + /* uv_do_reset: same conditions for cancellation as + uv_bau_process_retry_msg() */ + if ((msg->replied_to == 0) && + (msg->canceled == 0) && + (msg->sending_cpu == rap->sender) && + (msg->sw_ack_vector) && + (msg->msg_type != MSG_NOOP)) { + /* + * make everyone else ignore this message + */ + msg->canceled = 1; + slot = msg - bcp->va_queue_first; + count++; + /* + * only reset the resource if it is still pending + */ + mmr = uv_read_local_mmr + (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); + msg_res = ((msg->sw_ack_vector << 8) | + msg->sw_ack_vector); + if (mmr & msg_res) { + stat->d_rcanceled++; + uv_write_local_mmr( + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, + msg_res); } - printk("\n"); } } - return count; + return; } /* - * Examine the payload queue on all the distribution nodes to see - * which messages have not been seen, and which cpu(s) have not seen them. - * - * Returns the number of cpu's that have not responded. + * Use IPI to get all target uvhubs to release resources held by + * a given sending cpu number. */ -static int uv_examine_destinations(struct bau_target_nodemask *distribution) +static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution, + int sender) { - int sender; - int i; - int count = 0; + int uvhub; + int cpu; + cpumask_t mask; + struct reset_args reset_args; - sender = smp_processor_id(); - for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) { - if (!bau_node_isset(i, distribution)) + reset_args.sender = sender; + + cpus_clear(mask); + /* find a single cpu for each uvhub in this distribution mask */ + for (uvhub = 0; + uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE; + uvhub++) { + if (!bau_uvhub_isset(uvhub, distribution)) continue; - count += uv_examine_destination(uv_bau_table_bases[i], sender); + /* find a cpu for this uvhub */ + cpu = uvhub_to_first_cpu(uvhub); + cpu_set(cpu, mask); } - return count; + /* IPI all cpus; Preemption is already disabled */ + smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1); + return; +} + +static inline unsigned long +cycles_2_us(unsigned long long cyc) +{ + unsigned long long ns; + unsigned long us; + ns = (cyc * per_cpu(cyc2ns, smp_processor_id())) + >> CYC2NS_SCALE_FACTOR; + us = ns / 1000; + return us; } /* - * wait for completion of a broadcast message - * - * return COMPLETE, RETRY or GIVEUP + * wait for all cpus on this hub to finish their sends and go quiet + * leaves uvhub_quiesce set so that no new broadcasts are started by + * bau_flush_send_and_wait() + */ +static inline void +quiesce_local_uvhub(struct bau_control *hmaster) +{ + atomic_add_short_return(1, (struct atomic_short *) + &hmaster->uvhub_quiesce); +} + +/* + * mark this quiet-requestor as done + */ +static inline void +end_uvhub_quiesce(struct bau_control *hmaster) +{ + atomic_add_short_return(-1, (struct atomic_short *) + &hmaster->uvhub_quiesce); +} + +/* + * Wait for completion of a broadcast software ack message + * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP */ static int uv_wait_completion(struct bau_desc *bau_desc, - unsigned long mmr_offset, int right_shift) + unsigned long mmr_offset, int right_shift, int this_cpu, + struct bau_control *bcp, struct bau_control *smaster, long try) { - int exams = 0; - long destination_timeouts = 0; - long source_timeouts = 0; + int relaxes = 0; unsigned long descriptor_status; + unsigned long mmr; + unsigned long mask; + cycles_t ttime; + cycles_t timeout_time; + struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu); + struct bau_control *hmaster; + hmaster = bcp->uvhub_master; + timeout_time = get_cycles() + bcp->timeout_interval; + + /* spin on the status MMR, waiting for it to go idle */ while ((descriptor_status = (((unsigned long) uv_read_local_mmr(mmr_offset) >> right_shift) & UV_ACT_STATUS_MASK)) != DESC_STATUS_IDLE) { - if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { - source_timeouts++; - if (source_timeouts > SOURCE_TIMEOUT_LIMIT) - source_timeouts = 0; - __get_cpu_var(ptcstats).s_retry++; - return FLUSH_RETRY; - } /* - * spin here looking for progress at the destinations + * Our software ack messages may be blocked because there are + * no swack resources available. As long as none of them + * has timed out hardware will NACK our message and its + * state will stay IDLE. */ - if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { - destination_timeouts++; - if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { - /* - * returns number of cpus not responding - */ - if (uv_examine_destinations - (&bau_desc->distribution) == 0) { - __get_cpu_var(ptcstats).d_retry++; - return FLUSH_RETRY; - } - exams++; - if (exams >= uv_bau_retry_limit) { - printk(KERN_DEBUG - "uv_flush_tlb_others"); - printk("giving up on cpu %d\n", - smp_processor_id()); + if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { + stat->s_stimeout++; + return FLUSH_GIVEUP; + } else if (descriptor_status == + DESC_STATUS_DESTINATION_TIMEOUT) { + stat->s_dtimeout++; + ttime = get_cycles(); + + /* + * Our retries may be blocked by all destination + * swack resources being consumed, and a timeout + * pending. In that case hardware returns the + * ERROR that looks like a destination timeout. + */ + if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { + bcp->conseccompletes = 0; + return FLUSH_RETRY_PLUGGED; + } + + bcp->conseccompletes = 0; + return FLUSH_RETRY_TIMEOUT; + } else { + /* + * descriptor_status is still BUSY + */ + cpu_relax(); + relaxes++; + if (relaxes >= 10000) { + relaxes = 0; + if (get_cycles() > timeout_time) { + quiesce_local_uvhub(hmaster); + + /* single-thread the register change */ + spin_lock(&hmaster->masks_lock); + mmr = uv_read_local_mmr(mmr_offset); + mask = 0UL; + mask |= (3UL < right_shift); + mask = ~mask; + mmr &= mask; + uv_write_local_mmr(mmr_offset, mmr); + spin_unlock(&hmaster->masks_lock); + end_uvhub_quiesce(hmaster); + stat->s_busy++; return FLUSH_GIVEUP; } - /* - * delays can hang the simulator - udelay(1000); - */ - destination_timeouts = 0; } } - cpu_relax(); } + bcp->conseccompletes++; return FLUSH_COMPLETE; } +static inline cycles_t +sec_2_cycles(unsigned long sec) +{ + unsigned long ns; + cycles_t cyc; + + ns = sec * 1000000000; + cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); + return cyc; +} + +/* + * conditionally add 1 to *v, unless *v is >= u + * return 0 if we cannot add 1 to *v because it is >= u + * return 1 if we can add 1 to *v because it is < u + * the add is atomic + * + * This is close to atomic_add_unless(), but this allows the 'u' value + * to be lowered below the current 'v'. atomic_add_unless can only stop + * on equal. + */ +static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) +{ + spin_lock(lock); + if (atomic_read(v) >= u) { + spin_unlock(lock); + return 0; + } + atomic_inc(v); + spin_unlock(lock); + return 1; +} + /** * uv_flush_send_and_wait * - * Send a broadcast and wait for a broadcast message to complete. + * Send a broadcast and wait for it to complete. * - * The flush_mask contains the cpus the broadcast was sent to. + * The flush_mask contains the cpus the broadcast is to be sent to, plus + * cpus that are on the local uvhub. * - * Returns NULL if all remote flushing was done. The mask is zeroed. + * Returns NULL if all flushing represented in the mask was done. The mask + * is zeroed. * Returns @flush_mask if some remote flushing remains to be done. The - * mask will have some bits still set. + * mask will have some bits still set, representing any cpus on the local + * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed. */ -const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, - struct bau_desc *bau_desc, - struct cpumask *flush_mask) +const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, + struct cpumask *flush_mask, + struct bau_control *bcp) { - int completion_status = 0; int right_shift; - int tries = 0; - int pnode; + int uvhub; int bit; + int completion_status = 0; + int seq_number = 0; + long try = 0; + int cpu = bcp->uvhub_cpu; + int this_cpu = bcp->cpu; + int this_uvhub = bcp->uvhub; unsigned long mmr_offset; unsigned long index; cycles_t time1; cycles_t time2; + struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); + struct bau_control *smaster = bcp->socket_master; + struct bau_control *hmaster = bcp->uvhub_master; + + /* + * Spin here while there are hmaster->max_concurrent or more active + * descriptors. This is the per-uvhub 'throttle'. + */ + if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, + &hmaster->active_descriptor_count, + hmaster->max_concurrent)) { + stat->s_throttles++; + do { + cpu_relax(); + } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, + &hmaster->active_descriptor_count, + hmaster->max_concurrent)); + } + + while (hmaster->uvhub_quiesce) + cpu_relax(); if (cpu < UV_CPUS_PER_ACT_STATUS) { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; @@ -269,24 +556,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, } time1 = get_cycles(); do { - tries++; + /* + * Every message from any given cpu gets a unique message + * sequence number. But retries use that same number. + * Our message may have timed out at the destination because + * all sw-ack resources are in use and there is a timeout + * pending there. In that case, our last send never got + * placed into the queue and we need to persist until it + * does. + * + * Make any retry a type MSG_RETRY so that the destination will + * free any resource held by a previous message from this cpu. + */ + if (try == 0) { + /* use message type set by the caller the first time */ + seq_number = bcp->message_number++; + } else { + /* use RETRY type on all the rest; same sequence */ + bau_desc->header.msg_type = MSG_RETRY; + stat->s_retry_messages++; + } + bau_desc->header.sequence = seq_number; index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | - cpu; + bcp->uvhub_cpu; + bcp->send_message = get_cycles(); + uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); + + try++; completion_status = uv_wait_completion(bau_desc, mmr_offset, - right_shift); - } while (completion_status == FLUSH_RETRY); + right_shift, this_cpu, bcp, smaster, try); + + if (completion_status == FLUSH_RETRY_PLUGGED) { + /* + * Our retries may be blocked by all destination swack + * resources being consumed, and a timeout pending. In + * that case hardware immediately returns the ERROR + * that looks like a destination timeout. + */ + udelay(TIMEOUT_DELAY); + bcp->plugged_tries++; + if (bcp->plugged_tries >= PLUGSB4RESET) { + bcp->plugged_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, + this_cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_plug++; + } + } else if (completion_status == FLUSH_RETRY_TIMEOUT) { + hmaster->max_concurrent = 1; + bcp->timeout_tries++; + udelay(TIMEOUT_DELAY); + if (bcp->timeout_tries >= TIMEOUTSB4RESET) { + bcp->timeout_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, + this_cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_timeout++; + } + } + if (bcp->ipi_attempts >= 3) { + bcp->ipi_attempts = 0; + completion_status = FLUSH_GIVEUP; + break; + } + cpu_relax(); + } while ((completion_status == FLUSH_RETRY_PLUGGED) || + (completion_status == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); - __get_cpu_var(ptcstats).sflush += (time2 - time1); - if (tries > 1) - __get_cpu_var(ptcstats).retriesok++; - if (completion_status == FLUSH_GIVEUP) { + if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) + && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) + hmaster->max_concurrent++; + + /* + * hold any cpu not timing out here; no other cpu currently held by + * the 'throttle' should enter the activation code + */ + while (hmaster->uvhub_quiesce) + cpu_relax(); + atomic_dec(&hmaster->active_descriptor_count); + + /* guard against cycles wrap */ + if (time2 > time1) + stat->s_time += (time2 - time1); + else + stat->s_requestor--; /* don't count this one */ + if (completion_status == FLUSH_COMPLETE && try > 1) + stat->s_retriesok++; + else if (completion_status == FLUSH_GIVEUP) { /* * Cause the caller to do an IPI-style TLB shootdown on - * the cpu's, all of which are still in the mask. + * the target cpu's, all of which are still in the mask. */ - __get_cpu_var(ptcstats).ptc_i++; + stat->s_giveup++; return flush_mask; } @@ -295,18 +666,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, * use the IPI method of shootdown on them. */ for_each_cpu(bit, flush_mask) { - pnode = uv_cpu_to_pnode(bit); - if (pnode == this_pnode) + uvhub = uv_cpu_to_blade_id(bit); + if (uvhub == this_uvhub) continue; cpumask_clear_cpu(bit, flush_mask); } if (!cpumask_empty(flush_mask)) return flush_mask; + return NULL; } -static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); - /** * uv_flush_tlb_others - globally purge translation cache of a virtual * address or all TLB's @@ -323,8 +693,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); * The caller has derived the cpumask from the mm_struct. This function * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) * - * The cpumask is converted into a nodemask of the nodes containing - * the cpus. + * The cpumask is converted into a uvhubmask of the uvhubs containing + * those cpus. * * Note that this function should be called with preemption disabled. * @@ -336,52 +706,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long va, unsigned int cpu) { - struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask); - int i; - int bit; - int pnode; - int uv_cpu; - int this_pnode; + int remotes; + int tcpu; + int uvhub; int locals = 0; struct bau_desc *bau_desc; + struct cpumask *flush_mask; + struct ptc_stats *stat; + struct bau_control *bcp; - cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); + if (nobau) + return cpumask; - uv_cpu = uv_blade_processor_id(); - this_pnode = uv_hub_info->pnode; - bau_desc = __get_cpu_var(bau_control).descriptor_base; - bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu; + bcp = &per_cpu(bau_control, cpu); + /* + * Each sending cpu has a per-cpu mask which it fills from the caller's + * cpu mask. Only remote cpus are converted to uvhubs and copied. + */ + flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); + /* + * copy cpumask to flush_mask, removing current cpu + * (current cpu should already have been flushed by the caller and + * should never be returned if we return flush_mask) + */ + cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); + if (cpu_isset(cpu, *cpumask)) + locals++; /* current cpu was targeted */ - bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); + bau_desc = bcp->descriptor_base; + bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; - i = 0; - for_each_cpu(bit, flush_mask) { - pnode = uv_cpu_to_pnode(bit); - BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1)); - if (pnode == this_pnode) { + bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); + remotes = 0; + for_each_cpu(tcpu, flush_mask) { + uvhub = uv_cpu_to_blade_id(tcpu); + if (uvhub == bcp->uvhub) { locals++; continue; } - bau_node_set(pnode - uv_partition_base_pnode, - &bau_desc->distribution); - i++; + bau_uvhub_set(uvhub, &bau_desc->distribution); + remotes++; } - if (i == 0) { + if (remotes == 0) { /* - * no off_node flushing; return status for local node + * No off_hub flushing; return status for local hub. + * Return the caller's mask if all were local (the current + * cpu may be in that mask). */ if (locals) - return flush_mask; + return cpumask; else return NULL; } - __get_cpu_var(ptcstats).requestor++; - __get_cpu_var(ptcstats).ntargeted += i; + stat = &per_cpu(ptcstats, cpu); + stat->s_requestor++; + stat->s_ntargcpu += remotes; + remotes = bau_uvhub_weight(&bau_desc->distribution); + stat->s_ntarguvhub += remotes; + if (remotes >= 16) + stat->s_ntarguvhub16++; + else if (remotes >= 8) + stat->s_ntarguvhub8++; + else if (remotes >= 4) + stat->s_ntarguvhub4++; + else if (remotes >= 2) + stat->s_ntarguvhub2++; + else + stat->s_ntarguvhub1++; bau_desc->payload.address = va; bau_desc->payload.sending_cpu = cpu; - return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask); + /* + * uv_flush_send_and_wait returns null if all cpu's were messaged, or + * the adjusted flush_mask if any cpu's were not messaged. + */ + return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); } /* @@ -390,87 +790,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * * We received a broadcast assist message. * - * Interrupts may have been disabled; this interrupt could represent + * Interrupts are disabled; this interrupt could represent * the receipt of several messages. * - * All cores/threads on this node get this interrupt. - * The last one to see it does the s/w ack. + * All cores/threads on this hub get this interrupt. + * The last one to see it does the software ack. * (the resource will not be freed until noninterruptable cpus see this - * interrupt; hardware will timeout the s/w ack and reply ERROR) + * interrupt; hardware may timeout the s/w ack and reply ERROR) */ void uv_bau_message_interrupt(struct pt_regs *regs) { - struct bau_payload_queue_entry *va_queue_first; - struct bau_payload_queue_entry *va_queue_last; - struct bau_payload_queue_entry *msg; - struct pt_regs *old_regs = set_irq_regs(regs); - cycles_t time1; - cycles_t time2; - int msg_slot; - int sw_ack_slot; - int fw; int count = 0; - unsigned long local_pnode; - - ack_APIC_irq(); - exit_idle(); - irq_enter(); - - time1 = get_cycles(); - - local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); - - va_queue_first = __get_cpu_var(bau_control).va_queue_first; - va_queue_last = __get_cpu_var(bau_control).va_queue_last; - - msg = __get_cpu_var(bau_control).bau_msg_head; + cycles_t time_start; + struct bau_payload_queue_entry *msg; + struct bau_control *bcp; + struct ptc_stats *stat; + struct msg_desc msgdesc; + + time_start = get_cycles(); + bcp = &per_cpu(bau_control, smp_processor_id()); + stat = &per_cpu(ptcstats, smp_processor_id()); + msgdesc.va_queue_first = bcp->va_queue_first; + msgdesc.va_queue_last = bcp->va_queue_last; + msg = bcp->bau_msg_head; while (msg->sw_ack_vector) { count++; - fw = msg->sw_ack_vector; - msg_slot = msg - va_queue_first; - sw_ack_slot = ffs(fw) - 1; - - uv_bau_process_message(msg, msg_slot, sw_ack_slot); - + msgdesc.msg_slot = msg - msgdesc.va_queue_first; + msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1; + msgdesc.msg = msg; + uv_bau_process_message(&msgdesc, bcp); msg++; - if (msg > va_queue_last) - msg = va_queue_first; - __get_cpu_var(bau_control).bau_msg_head = msg; + if (msg > msgdesc.va_queue_last) + msg = msgdesc.va_queue_first; + bcp->bau_msg_head = msg; } + stat->d_time += (get_cycles() - time_start); if (!count) - __get_cpu_var(ptcstats).nomsg++; + stat->d_nomsg++; else if (count > 1) - __get_cpu_var(ptcstats).multmsg++; - - time2 = get_cycles(); - __get_cpu_var(ptcstats).dflush += (time2 - time1); - - irq_exit(); - set_irq_regs(old_regs); + stat->d_multmsg++; + ack_APIC_irq(); } /* * uv_enable_timeouts * - * Each target blade (i.e. blades that have cpu's) needs to have + * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have * shootdown message timeouts enabled. The timeout does not cause * an interrupt, but causes an error message to be returned to * the sender. */ static void uv_enable_timeouts(void) { - int blade; - int nblades; + int uvhub; + int nuvhubs; int pnode; unsigned long mmr_image; - nblades = uv_num_possible_blades(); + nuvhubs = uv_num_possible_blades(); - for (blade = 0; blade < nblades; blade++) { - if (!uv_blade_nr_possible_cpus(blade)) + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + if (!uv_blade_nr_possible_cpus(uvhub)) continue; - pnode = uv_blade_to_pnode(blade); + pnode = uv_blade_to_pnode(uvhub); mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); /* @@ -523,9 +906,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data) { } +static inline unsigned long long +millisec_2_cycles(unsigned long millisec) +{ + unsigned long ns; + unsigned long long cyc; + + ns = millisec * 1000; + cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); + return cyc; +} + /* - * Display the statistics thru /proc - * data points to the cpu number + * Display the statistics thru /proc. + * 'data' points to the cpu number */ static int uv_ptc_seq_show(struct seq_file *file, void *data) { @@ -536,78 +930,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) if (!cpu) { seq_printf(file, - "# cpu requestor requestee one all sretry dretry ptc_i "); + "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "sw_ack sflush dflush sok dnomsg dmult starget\n"); + "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); + seq_printf(file, + "retries rok resetp resett giveup sto bz throt "); + seq_printf(file, + "sw_ack recv rtime all "); + seq_printf(file, + "one mult none retry canc nocan reset rcan\n"); } |