aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel/tlb_uv.c
diff options
context:
space:
mode:
authorCliff Wickman <cpw@sgi.com>2010-04-14 11:35:46 -0500
committerIngo Molnar <mingo@elte.hu>2010-04-14 18:49:53 +0200
commitb8f7fb13d2d7ff14818fd1d3edd8b834d38b0217 (patch)
tree48844c12cc443690116abbec7e836f8c08360d56 /arch/x86/kernel/tlb_uv.c
parent2acebe9ecb2b77876e87a1480729cfb2db4570dd (diff)
x86, UV: Improve BAU performance and error recovery
- increase performance of the interrupt handler - release timed-out software acknowledge resources - recover from continuous-busy status due to a hardware issue - add a 'throttle' to keep a uvhub from sending more than a specified number of broadcasts concurrently (work around the hardware issue) - provide a 'nobau' boot command line option - rename 'pnode' and 'node' to 'uvhub' (the 'node' terminology is ambiguous) - add some new statistics about the scope of broadcasts, retries, the hardware issue and the 'throttle' - split off new function uv_bau_retry_msg() from uv_bau_process_message() per community coding style feedback. - simplify the argument list to uv_bau_process_message(), per community coding style feedback. Signed-off-by: Cliff Wickman <cpw@sgi.com> Cc: linux-mm@kvack.org Cc: Jack Steiner <steiner@sgi.com> Cc: Russ Anderson <rja@sgi.com> Cc: Mike Travis <travis@sgi.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Thomas Gleixner <tglx@linutronix.de> LKML-Reference: <E1O25Z4-0004Ur-PB@eag09.americas.sgi.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/kernel/tlb_uv.c')
-rw-r--r--arch/x86/kernel/tlb_uv.c1270
1 files changed, 900 insertions, 370 deletions
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ef68ba48564..414f7c4fe76 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1,7 +1,7 @@
/*
* SGI UltraViolet TLB flush routines.
*
- * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
+ * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
*
* This code is released under the GNU General Public License version 2 or
* later.
@@ -19,44 +19,67 @@
#include <asm/idle.h>
#include <asm/tsc.h>
#include <asm/irq_vectors.h>
+#include <asm/timer.h>
+
+struct msg_desc {
+ struct bau_payload_queue_entry *msg;
+ int msg_slot;
+ int sw_ack_slot;
+ struct bau_payload_queue_entry *va_queue_first;
+ struct bau_payload_queue_entry *va_queue_last;
+};
#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
-static struct bau_control **uv_bau_table_bases __read_mostly;
-static int uv_bau_retry_limit __read_mostly;
+static int uv_bau_max_concurrent __read_mostly;
-/* base pnode in this partition */
-static int uv_partition_base_pnode __read_mostly;
+static int nobau;
+static int __init setup_nobau(char *arg)
+{
+ nobau = 1;
+ return 0;
+}
+early_param("nobau", setup_nobau);
-static unsigned long uv_mmask __read_mostly;
+/* base pnode in this partition */
+static int uv_partition_base_pnode __read_mostly;
+/* position of pnode (which is nasid>>1): */
+static int uv_nshift __read_mostly;
+static unsigned long uv_mmask __read_mostly;
static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
static DEFINE_PER_CPU(struct bau_control, bau_control);
+static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
+
+struct reset_args {
+ int sender;
+};
/*
- * Determine the first node on a blade.
+ * Determine the first node on a uvhub. 'Nodes' are used for kernel
+ * memory allocation.
*/
-static int __init blade_to_first_node(int blade)
+static int __init uvhub_to_first_node(int uvhub)
{
int node, b;
for_each_online_node(node) {
b = uv_node_to_blade_id(node);
- if (blade == b)
+ if (uvhub == b)
return node;
}
- return -1; /* shouldn't happen */
+ return -1;
}
/*
- * Determine the apicid of the first cpu on a blade.
+ * Determine the apicid of the first cpu on a uvhub.
*/
-static int __init blade_to_first_apicid(int blade)
+static int __init uvhub_to_first_apicid(int uvhub)
{
int cpu;
for_each_present_cpu(cpu)
- if (blade == uv_cpu_to_blade_id(cpu))
+ if (uvhub == uv_cpu_to_blade_id(cpu))
return per_cpu(x86_cpu_to_apicid, cpu);
return -1;
}
@@ -69,195 +92,459 @@ static int __init blade_to_first_apicid(int blade)
* clear of the Timeout bit (as well) will free the resource. No reply will
* be sent (the hardware will only do one reply per message).
*/
-static void uv_reply_to_message(int resource,
- struct bau_payload_queue_entry *msg,
- struct bau_msg_status *msp)
+static inline void uv_reply_to_message(struct msg_desc *mdp,
+ struct bau_control *bcp)
{
unsigned long dw;
+ struct bau_payload_queue_entry *msg;
- dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
+ msg = mdp->msg;
+ if (!msg->canceled) {
+ dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
+ msg->sw_ack_vector;
+ uv_write_local_mmr(
+ UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
+ }
msg->replied_to = 1;
msg->sw_ack_vector = 0;
- if (msp)
- msp->seen_by.bits = 0;
- uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
}
/*
- * Do all the things a cpu should do for a TLB shootdown message.
- * Other cpu's may come here at the same time for this message.
+ * Process the receipt of a RETRY message
*/
-static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
- int msg_slot, int sw_ack_slot)
+static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
+ struct bau_control *bcp)
{
- unsigned long this_cpu_mask;
- struct bau_msg_status *msp;
- int cpu;
+ int i;
+ int cancel_count = 0;
+ int slot2;
+ unsigned long msg_res;
+ unsigned long mmr = 0;
+ struct bau_payload_queue_entry *msg;
+ struct bau_payload_queue_entry *msg2;
+ struct ptc_stats *stat;
- msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
- cpu = uv_blade_processor_id();
- msg->number_of_cpus =
- uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
- this_cpu_mask = 1UL << cpu;
- if (msp->seen_by.bits & this_cpu_mask)
- return;
- atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
+ msg = mdp->msg;
+ stat = &per_cpu(ptcstats, bcp->cpu);
+ stat->d_retries++;
+ /*
+ * cancel any message from msg+1 to the retry itself
+ */
+ for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
+ if (msg2 > mdp->va_queue_last)
+ msg2 = mdp->va_queue_first;
+ if (msg2 == msg)
+ break;
+
+ /* same conditions for cancellation as uv_do_reset */
+ if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
+ (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
+ msg->sw_ack_vector) == 0) &&
+ (msg2->sending_cpu == msg->sending_cpu) &&
+ (msg2->msg_type != MSG_NOOP)) {
+ slot2 = msg2 - mdp->va_queue_first;
+ mmr = uv_read_local_mmr
+ (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+ msg_res = ((msg2->sw_ack_vector << 8) |
+ msg2->sw_ack_vector);
+ /*
+ * This is a message retry; clear the resources held
+ * by the previous message only if they timed out.
+ * If it has not timed out we have an unexpected
+ * situation to report.
+ */
+ if (mmr & (msg_res << 8)) {
+ /*
+ * is the resource timed out?
+ * make everyone ignore the cancelled message.
+ */
+ msg2->canceled = 1;
+ stat->d_canceled++;
+ cancel_count++;
+ uv_write_local_mmr(
+ UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+ (msg_res << 8) | msg_res);
+ } else
+ printk(KERN_INFO "note bau retry: no effect\n");
+ }
+ }
+ if (!cancel_count)
+ stat->d_nocanceled++;
+}
- if (msg->replied_to == 1)
- return;
+/*
+ * Do all the things a cpu should do for a TLB shootdown message.
+ * Other cpu's may come here at the same time for this message.
+ */
+static void uv_bau_process_message(struct msg_desc *mdp,
+ struct bau_control *bcp)
+{
+ int msg_ack_count;
+ short socket_ack_count = 0;
+ struct ptc_stats *stat;
+ struct bau_payload_queue_entry *msg;
+ struct bau_control *smaster = bcp->socket_master;
+ /*
+ * This must be a normal message, or retry of a normal message
+ */
+ msg = mdp->msg;
+ stat = &per_cpu(ptcstats, bcp->cpu);
if (msg->address == TLB_FLUSH_ALL) {
local_flush_tlb();
- __get_cpu_var(ptcstats).alltlb++;
+ stat->d_alltlb++;
} else {
__flush_tlb_one(msg->address);
- __get_cpu_var(ptcstats).onetlb++;
+ stat->d_onetlb++;
}
+ stat->d_requestee++;
+
+ /*
+ * One cpu on each uvhub has the additional job on a RETRY
+ * of releasing the resource held by the message that is
+ * being retried. That message is identified by sending
+ * cpu number.
+ */
+ if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
+ uv_bau_process_retry_msg(mdp, bcp);
- __get_cpu_var(ptcstats).requestee++;
+ /*
+ * This is a sw_ack message, so we have to reply to it.
+ * Count each responding cpu on the socket. This avoids
+ * pinging the count's cache line back and forth between
+ * the sockets.
+ */
+ socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
+ &smaster->socket_acknowledge_count[mdp->msg_slot]);
+ if (socket_ack_count == bcp->cpus_in_socket) {
+ /*
+ * Both sockets dump their completed count total into
+ * the message's count.
+ */
+ smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
+ msg_ack_count = atomic_add_short_return(socket_ack_count,
+ (struct atomic_short *)&msg->acknowledge_count);
+
+ if (msg_ack_count == bcp->cpus_in_uvhub) {
+ /*
+ * All cpus in uvhub saw it; reply
+ */
+ uv_reply_to_message(mdp, bcp);
+ }
+ }
- atomic_inc_short(&msg->acknowledge_count);
- if (msg->number_of_cpus == msg->acknowledge_count)
- uv_reply_to_message(sw_ack_slot, msg, msp);
+ return;
}
/*
- * Examine the payload queue on one distribution node to see
- * which messages have not been seen, and which cpu(s) have not seen them.
+ * Determine the first cpu on a uvhub.
+ */
+static int uvhub_to_first_cpu(int uvhub)
+{
+ int cpu;
+ for_each_present_cpu(cpu)
+ if (uvhub == uv_cpu_to_blade_id(cpu))
+ return cpu;
+ return -1;
+}
+
+/*
+ * Last resort when we get a large number of destination timeouts is
+ * to clear resources held by a given cpu.
+ * Do this with IPI so that all messages in the BAU message queue
+ * can be identified by their nonzero sw_ack_vector field.
*
- * Returns the number of cpu's that have not responded.
+ * This is entered for a single cpu on the uvhub.
+ * The sender want's this uvhub to free a specific message's
+ * sw_ack resources.
*/
-static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
+static void
+uv_do_reset(void *ptr)
{
- struct bau_payload_queue_entry *msg;
- struct bau_msg_status *msp;
- int count = 0;
int i;
- int j;
+ int slot;
+ int count = 0;
+ unsigned long mmr;
+ unsigned long msg_res;
+ struct bau_control *bcp;
+ struct reset_args *rap;
+ struct bau_payload_queue_entry *msg;
+ struct ptc_stats *stat;
- for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
- msg++, i++) {
- if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
- msp = bau_tablesp->msg_statuses + i;
- printk(KERN_DEBUG
- "blade %d: address:%#lx %d of %d, not cpu(s): ",
- i, msg->address, msg->acknowledge_count,
- msg->number_of_cpus);
- for (j = 0; j < msg->number_of_cpus; j++) {
- if (!((1L << j) & msp->seen_by.bits)) {
- count++;
- printk("%d ", j);
- }
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ rap = (struct reset_args *)ptr;
+ stat = &per_cpu(ptcstats, bcp->cpu);
+ stat->d_resets++;
+
+ /*
+ * We're looking for the given sender, and
+ * will free its sw_ack resource.
+ * If all cpu's finally responded after the timeout, its
+ * message 'replied_to' was set.
+ */
+ for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
+ /* uv_do_reset: same conditions for cancellation as
+ uv_bau_process_retry_msg() */
+ if ((msg->replied_to == 0) &&
+ (msg->canceled == 0) &&
+ (msg->sending_cpu == rap->sender) &&
+ (msg->sw_ack_vector) &&
+ (msg->msg_type != MSG_NOOP)) {
+ /*
+ * make everyone else ignore this message
+ */
+ msg->canceled = 1;
+ slot = msg - bcp->va_queue_first;
+ count++;
+ /*
+ * only reset the resource if it is still pending
+ */
+ mmr = uv_read_local_mmr
+ (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+ msg_res = ((msg->sw_ack_vector << 8) |
+ msg->sw_ack_vector);
+ if (mmr & msg_res) {
+ stat->d_rcanceled++;
+ uv_write_local_mmr(
+ UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+ msg_res);
}
- printk("\n");
}
}
- return count;
+ return;
}
/*
- * Examine the payload queue on all the distribution nodes to see
- * which messages have not been seen, and which cpu(s) have not seen them.
- *
- * Returns the number of cpu's that have not responded.
+ * Use IPI to get all target uvhubs to release resources held by
+ * a given sending cpu number.
*/
-static int uv_examine_destinations(struct bau_target_nodemask *distribution)
+static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
+ int sender)
{
- int sender;
- int i;
- int count = 0;
+ int uvhub;
+ int cpu;
+ cpumask_t mask;
+ struct reset_args reset_args;
- sender = smp_processor_id();
- for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
- if (!bau_node_isset(i, distribution))
+ reset_args.sender = sender;
+
+ cpus_clear(mask);
+ /* find a single cpu for each uvhub in this distribution mask */
+ for (uvhub = 0;
+ uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
+ uvhub++) {
+ if (!bau_uvhub_isset(uvhub, distribution))
continue;
- count += uv_examine_destination(uv_bau_table_bases[i], sender);
+ /* find a cpu for this uvhub */
+ cpu = uvhub_to_first_cpu(uvhub);
+ cpu_set(cpu, mask);
}
- return count;
+ /* IPI all cpus; Preemption is already disabled */
+ smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
+ return;
+}
+
+static inline unsigned long
+cycles_2_us(unsigned long long cyc)
+{
+ unsigned long long ns;
+ unsigned long us;
+ ns = (cyc * per_cpu(cyc2ns, smp_processor_id()))
+ >> CYC2NS_SCALE_FACTOR;
+ us = ns / 1000;
+ return us;
}
/*
- * wait for completion of a broadcast message
- *
- * return COMPLETE, RETRY or GIVEUP
+ * wait for all cpus on this hub to finish their sends and go quiet
+ * leaves uvhub_quiesce set so that no new broadcasts are started by
+ * bau_flush_send_and_wait()
+ */
+static inline void
+quiesce_local_uvhub(struct bau_control *hmaster)
+{
+ atomic_add_short_return(1, (struct atomic_short *)
+ &hmaster->uvhub_quiesce);
+}
+
+/*
+ * mark this quiet-requestor as done
+ */
+static inline void
+end_uvhub_quiesce(struct bau_control *hmaster)
+{
+ atomic_add_short_return(-1, (struct atomic_short *)
+ &hmaster->uvhub_quiesce);
+}
+
+/*
+ * Wait for completion of a broadcast software ack message
+ * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
*/
static int uv_wait_completion(struct bau_desc *bau_desc,
- unsigned long mmr_offset, int right_shift)
+ unsigned long mmr_offset, int right_shift, int this_cpu,
+ struct bau_control *bcp, struct bau_control *smaster, long try)
{
- int exams = 0;
- long destination_timeouts = 0;
- long source_timeouts = 0;
+ int relaxes = 0;
unsigned long descriptor_status;
+ unsigned long mmr;
+ unsigned long mask;
+ cycles_t ttime;
+ cycles_t timeout_time;
+ struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
+ struct bau_control *hmaster;
+ hmaster = bcp->uvhub_master;
+ timeout_time = get_cycles() + bcp->timeout_interval;
+
+ /* spin on the status MMR, waiting for it to go idle */
while ((descriptor_status = (((unsigned long)
uv_read_local_mmr(mmr_offset) >>
right_shift) & UV_ACT_STATUS_MASK)) !=
DESC_STATUS_IDLE) {
- if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
- source_timeouts++;
- if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
- source_timeouts = 0;
- __get_cpu_var(ptcstats).s_retry++;
- return FLUSH_RETRY;
- }
/*
- * spin here looking for progress at the destinations
+ * Our software ack messages may be blocked because there are
+ * no swack resources available. As long as none of them
+ * has timed out hardware will NACK our message and its
+ * state will stay IDLE.
*/
- if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
- destination_timeouts++;
- if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
- /*
- * returns number of cpus not responding
- */
- if (uv_examine_destinations
- (&bau_desc->distribution) == 0) {
- __get_cpu_var(ptcstats).d_retry++;
- return FLUSH_RETRY;
- }
- exams++;
- if (exams >= uv_bau_retry_limit) {
- printk(KERN_DEBUG
- "uv_flush_tlb_others");
- printk("giving up on cpu %d\n",
- smp_processor_id());
+ if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+ stat->s_stimeout++;
+ return FLUSH_GIVEUP;
+ } else if (descriptor_status ==
+ DESC_STATUS_DESTINATION_TIMEOUT) {
+ stat->s_dtimeout++;
+ ttime = get_cycles();
+
+ /*
+ * Our retries may be blocked by all destination
+ * swack resources being consumed, and a timeout
+ * pending. In that case hardware returns the
+ * ERROR that looks like a destination timeout.
+ */
+ if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_PLUGGED;
+ }
+
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_TIMEOUT;
+ } else {
+ /*
+ * descriptor_status is still BUSY
+ */
+ cpu_relax();
+ relaxes++;
+ if (relaxes >= 10000) {
+ relaxes = 0;
+ if (get_cycles() > timeout_time) {
+ quiesce_local_uvhub(hmaster);
+
+ /* single-thread the register change */
+ spin_lock(&hmaster->masks_lock);
+ mmr = uv_read_local_mmr(mmr_offset);
+ mask = 0UL;
+ mask |= (3UL < right_shift);
+ mask = ~mask;
+ mmr &= mask;
+ uv_write_local_mmr(mmr_offset, mmr);
+ spin_unlock(&hmaster->masks_lock);
+ end_uvhub_quiesce(hmaster);
+ stat->s_busy++;
return FLUSH_GIVEUP;
}
- /*
- * delays can hang the simulator
- udelay(1000);
- */
- destination_timeouts = 0;
}
}
- cpu_relax();
}
+ bcp->conseccompletes++;
return FLUSH_COMPLETE;
}
+static inline cycles_t
+sec_2_cycles(unsigned long sec)
+{
+ unsigned long ns;
+ cycles_t cyc;
+
+ ns = sec * 1000000000;
+ cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+ return cyc;
+}
+
+/*
+ * conditionally add 1 to *v, unless *v is >= u
+ * return 0 if we cannot add 1 to *v because it is >= u
+ * return 1 if we can add 1 to *v because it is < u
+ * the add is atomic
+ *
+ * This is close to atomic_add_unless(), but this allows the 'u' value
+ * to be lowered below the current 'v'. atomic_add_unless can only stop
+ * on equal.
+ */
+static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+{
+ spin_lock(lock);
+ if (atomic_read(v) >= u) {
+ spin_unlock(lock);
+ return 0;
+ }
+ atomic_inc(v);
+ spin_unlock(lock);
+ return 1;
+}
+
/**
* uv_flush_send_and_wait
*
- * Send a broadcast and wait for a broadcast message to complete.
+ * Send a broadcast and wait for it to complete.
*
- * The flush_mask contains the cpus the broadcast was sent to.
+ * The flush_mask contains the cpus the broadcast is to be sent to, plus
+ * cpus that are on the local uvhub.
*
- * Returns NULL if all remote flushing was done. The mask is zeroed.
+ * Returns NULL if all flushing represented in the mask was done. The mask
+ * is zeroed.
* Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set.
+ * mask will have some bits still set, representing any cpus on the local
+ * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
*/
-const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
- struct bau_desc *bau_desc,
- struct cpumask *flush_mask)
+const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
+ struct cpumask *flush_mask,
+ struct bau_control *bcp)
{
- int completion_status = 0;
int right_shift;
- int tries = 0;
- int pnode;
+ int uvhub;
int bit;
+ int completion_status = 0;
+ int seq_number = 0;
+ long try = 0;
+ int cpu = bcp->uvhub_cpu;
+ int this_cpu = bcp->cpu;
+ int this_uvhub = bcp->uvhub;
unsigned long mmr_offset;
unsigned long index;
cycles_t time1;
cycles_t time2;
+ struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
+ struct bau_control *smaster = bcp->socket_master;
+ struct bau_control *hmaster = bcp->uvhub_master;
+
+ /*
+ * Spin here while there are hmaster->max_concurrent or more active
+ * descriptors. This is the per-uvhub 'throttle'.
+ */
+ if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+ &hmaster->active_descriptor_count,
+ hmaster->max_concurrent)) {
+ stat->s_throttles++;
+ do {
+ cpu_relax();
+ } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+ &hmaster->active_descriptor_count,
+ hmaster->max_concurrent));
+ }
+
+ while (hmaster->uvhub_quiesce)
+ cpu_relax();
if (cpu < UV_CPUS_PER_ACT_STATUS) {
mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -269,24 +556,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
}
time1 = get_cycles();
do {
- tries++;
+ /*
+ * Every message from any given cpu gets a unique message
+ * sequence number. But retries use that same number.
+ * Our message may have timed out at the destination because
+ * all sw-ack resources are in use and there is a timeout
+ * pending there. In that case, our last send never got
+ * placed into the queue and we need to persist until it
+ * does.
+ *
+ * Make any retry a type MSG_RETRY so that the destination will
+ * free any resource held by a previous message from this cpu.
+ */
+ if (try == 0) {
+ /* use message type set by the caller the first time */
+ seq_number = bcp->message_number++;
+ } else {
+ /* use RETRY type on all the rest; same sequence */
+ bau_desc->header.msg_type = MSG_RETRY;
+ stat->s_retry_messages++;
+ }
+ bau_desc->header.sequence = seq_number;
index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
- cpu;
+ bcp->uvhub_cpu;
+ bcp->send_message = get_cycles();
+
uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+
+ try++;
completion_status = uv_wait_completion(bau_desc, mmr_offset,
- right_shift);
- } while (completion_status == FLUSH_RETRY);
+ right_shift, this_cpu, bcp, smaster, try);
+
+ if (completion_status == FLUSH_RETRY_PLUGGED) {
+ /*
+ * Our retries may be blocked by all destination swack
+ * resources being consumed, and a timeout pending. In
+ * that case hardware immediately returns the ERROR
+ * that looks like a destination timeout.
+ */
+ udelay(TIMEOUT_DELAY);
+ bcp->plugged_tries++;
+ if (bcp->plugged_tries >= PLUGSB4RESET) {
+ bcp->plugged_tries = 0;
+ quiesce_local_uvhub(hmaster);
+ spin_lock(&hmaster->queue_lock);
+ uv_reset_with_ipi(&bau_desc->distribution,
+ this_cpu);
+ spin_unlock(&hmaster->queue_lock);
+ end_uvhub_quiesce(hmaster);
+ bcp->ipi_attempts++;
+ stat->s_resets_plug++;
+ }
+ } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
+ hmaster->max_concurrent = 1;
+ bcp->timeout_tries++;
+ udelay(TIMEOUT_DELAY);
+ if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
+ bcp->timeout_tries = 0;
+ quiesce_local_uvhub(hmaster);
+ spin_lock(&hmaster->queue_lock);
+ uv_reset_with_ipi(&bau_desc->distribution,
+ this_cpu);
+ spin_unlock(&hmaster->queue_lock);
+ end_uvhub_quiesce(hmaster);
+ bcp->ipi_attempts++;
+ stat->s_resets_timeout++;
+ }
+ }
+ if (bcp->ipi_attempts >= 3) {
+ bcp->ipi_attempts = 0;
+ completion_status = FLUSH_GIVEUP;
+ break;
+ }
+ cpu_relax();
+ } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
+ (completion_status == FLUSH_RETRY_TIMEOUT));
time2 = get_cycles();
- __get_cpu_var(ptcstats).sflush += (time2 - time1);
- if (tries > 1)
- __get_cpu_var(ptcstats).retriesok++;
- if (completion_status == FLUSH_GIVEUP) {
+ if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
+ && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
+ hmaster->max_concurrent++;
+
+ /*
+ * hold any cpu not timing out here; no other cpu currently held by
+ * the 'throttle' should enter the activation code
+ */
+ while (hmaster->uvhub_quiesce)
+ cpu_relax();
+ atomic_dec(&hmaster->active_descriptor_count);
+
+ /* guard against cycles wrap */
+ if (time2 > time1)
+ stat->s_time += (time2 - time1);
+ else
+ stat->s_requestor--; /* don't count this one */
+ if (completion_status == FLUSH_COMPLETE && try > 1)
+ stat->s_retriesok++;
+ else if (completion_status == FLUSH_GIVEUP) {
/*
* Cause the caller to do an IPI-style TLB shootdown on
- * the cpu's, all of which are still in the mask.
+ * the target cpu's, all of which are still in the mask.
*/
- __get_cpu_var(ptcstats).ptc_i++;
+ stat->s_giveup++;
return flush_mask;
}
@@ -295,18 +666,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
* use the IPI method of shootdown on them.
*/
for_each_cpu(bit, flush_mask) {
- pnode = uv_cpu_to_pnode(bit);
- if (pnode == this_pnode)
+ uvhub = uv_cpu_to_blade_id(bit);
+ if (uvhub == this_uvhub)
continue;
cpumask_clear_cpu(bit, flush_mask);
}
if (!cpumask_empty(flush_mask))
return flush_mask;
+
return NULL;
}
-static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
-
/**
* uv_flush_tlb_others - globally purge translation cache of a virtual
* address or all TLB's
@@ -323,8 +693,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
* The caller has derived the cpumask from the mm_struct. This function
* is called only if there are bits set in the mask. (e.g. flush_tlb_page())
*
- * The cpumask is converted into a nodemask of the nodes containing
- * the cpus.
+ * The cpumask is converted into a uvhubmask of the uvhubs containing
+ * those cpus.
*
* Note that this function should be called with preemption disabled.
*
@@ -336,52 +706,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long va, unsigned int cpu)
{
- struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
- int i;
- int bit;
- int pnode;
- int uv_cpu;
- int this_pnode;
+ int remotes;
+ int tcpu;
+ int uvhub;
int locals = 0;
struct bau_desc *bau_desc;
+ struct cpumask *flush_mask;
+ struct ptc_stats *stat;
+ struct bau_control *bcp;
- cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+ if (nobau)
+ return cpumask;
- uv_cpu = uv_blade_processor_id();
- this_pnode = uv_hub_info->pnode;
- bau_desc = __get_cpu_var(bau_control).descriptor_base;
- bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
+ bcp = &per_cpu(bau_control, cpu);
+ /*
+ * Each sending cpu has a per-cpu mask which it fills from the caller's
+ * cpu mask. Only remote cpus are converted to uvhubs and copied.
+ */
+ flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
+ /*
+ * copy cpumask to flush_mask, removing current cpu
+ * (current cpu should already have been flushed by the caller and
+ * should never be returned if we return flush_mask)
+ */
+ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+ if (cpu_isset(cpu, *cpumask))
+ locals++; /* current cpu was targeted */
- bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+ bau_desc = bcp->descriptor_base;
+ bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
- i = 0;
- for_each_cpu(bit, flush_mask) {
- pnode = uv_cpu_to_pnode(bit);
- BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
- if (pnode == this_pnode) {
+ bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+ remotes = 0;
+ for_each_cpu(tcpu, flush_mask) {
+ uvhub = uv_cpu_to_blade_id(tcpu);
+ if (uvhub == bcp->uvhub) {
locals++;
continue;
}
- bau_node_set(pnode - uv_partition_base_pnode,
- &bau_desc->distribution);
- i++;
+ bau_uvhub_set(uvhub, &bau_desc->distribution);
+ remotes++;
}
- if (i == 0) {
+ if (remotes == 0) {
/*
- * no off_node flushing; return status for local node
+ * No off_hub flushing; return status for local hub.
+ * Return the caller's mask if all were local (the current
+ * cpu may be in that mask).
*/
if (locals)
- return flush_mask;
+ return cpumask;
else
return NULL;
}
- __get_cpu_var(ptcstats).requestor++;
- __get_cpu_var(ptcstats).ntargeted += i;
+ stat = &per_cpu(ptcstats, cpu);
+ stat->s_requestor++;
+ stat->s_ntargcpu += remotes;
+ remotes = bau_uvhub_weight(&bau_desc->distribution);
+ stat->s_ntarguvhub += remotes;
+ if (remotes >= 16)
+ stat->s_ntarguvhub16++;
+ else if (remotes >= 8)
+ stat->s_ntarguvhub8++;
+ else if (remotes >= 4)
+ stat->s_ntarguvhub4++;
+ else if (remotes >= 2)
+ stat->s_ntarguvhub2++;
+ else
+ stat->s_ntarguvhub1++;
bau_desc->payload.address = va;
bau_desc->payload.sending_cpu = cpu;
- return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
+ /*
+ * uv_flush_send_and_wait returns null if all cpu's were messaged, or
+ * the adjusted flush_mask if any cpu's were not messaged.
+ */
+ return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
}
/*
@@ -390,87 +790,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
*
* We received a broadcast assist message.
*
- * Interrupts may have been disabled; this interrupt could represent
+ * Interrupts are disabled; this interrupt could represent
* the receipt of several messages.
*
- * All cores/threads on this node get this interrupt.
- * The last one to see it does the s/w ack.
+ * All cores/threads on this hub get this interrupt.
+ * The last one to see it does the software ack.
* (the resource will not be freed until noninterruptable cpus see this
- * interrupt; hardware will timeout the s/w ack and reply ERROR)
+ * interrupt; hardware may timeout the s/w ack and reply ERROR)
*/
void uv_bau_message_interrupt(struct pt_regs *regs)
{
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
- struct bau_payload_queue_entry *msg;
- struct pt_regs *old_regs = set_irq_regs(regs);
- cycles_t time1;
- cycles_t time2;
- int msg_slot;
- int sw_ack_slot;
- int fw;
int count = 0;
- unsigned long local_pnode;
-
- ack_APIC_irq();
- exit_idle();
- irq_enter();
-
- time1 = get_cycles();
-
- local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
-
- va_queue_first = __get_cpu_var(bau_control).va_queue_first;
- va_queue_last = __get_cpu_var(bau_control).va_queue_last;
-
- msg = __get_cpu_var(bau_control).bau_msg_head;
+ cycles_t time_start;
+ struct bau_payload_queue_entry *msg;
+ struct bau_control *bcp;
+ struct ptc_stats *stat;
+ struct msg_desc msgdesc;
+
+ time_start = get_cycles();
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ stat = &per_cpu(ptcstats, smp_processor_id());
+ msgdesc.va_queue_first = bcp->va_queue_first;
+ msgdesc.va_queue_last = bcp->va_queue_last;
+ msg = bcp->bau_msg_head;
while (msg->sw_ack_vector) {
count++;
- fw = msg->sw_ack_vector;
- msg_slot = msg - va_queue_first;
- sw_ack_slot = ffs(fw) - 1;
-
- uv_bau_process_message(msg, msg_slot, sw_ack_slot);
-
+ msgdesc.msg_slot = msg - msgdesc.va_queue_first;
+ msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
+ msgdesc.msg = msg;
+ uv_bau_process_message(&msgdesc, bcp);
msg++;
- if (msg > va_queue_last)
- msg = va_queue_first;
- __get_cpu_var(bau_control).bau_msg_head = msg;
+ if (msg > msgdesc.va_queue_last)
+ msg = msgdesc.va_queue_first;
+ bcp->bau_msg_head = msg;
}
+ stat->d_time += (get_cycles() - time_start);
if (!count)
- __get_cpu_var(ptcstats).nomsg++;
+ stat->d_nomsg++;
else if (count > 1)
- __get_cpu_var(ptcstats).multmsg++;
-
- time2 = get_cycles();
- __get_cpu_var(ptcstats).dflush += (time2 - time1);
-
- irq_exit();
- set_irq_regs(old_regs);
+ stat->d_multmsg++;
+ ack_APIC_irq();
}
/*
* uv_enable_timeouts
*
- * Each target blade (i.e. blades that have cpu's) needs to have
+ * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
* shootdown message timeouts enabled. The timeout does not cause
* an interrupt, but causes an error message to be returned to
* the sender.
*/
static void uv_enable_timeouts(void)
{
- int blade;
- int nblades;
+ int uvhub;
+ int nuvhubs;
int pnode;
unsigned long mmr_image;
- nblades = uv_num_possible_blades();
+ nuvhubs = uv_num_possible_blades();
- for (blade = 0; blade < nblades; blade++) {
- if (!uv_blade_nr_possible_cpus(blade))
+ for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+ if (!uv_blade_nr_possible_cpus(uvhub))
continue;
- pnode = uv_blade_to_pnode(blade);
+ pnode = uv_blade_to_pnode(uvhub);
mmr_image =
uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
/*
@@ -523,9 +906,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
{
}
+static inline unsigned long long
+millisec_2_cycles(unsigned long millisec)
+{
+ unsigned long ns;
+ unsigned long long cyc;
+
+ ns = millisec * 1000;
+ cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+ return cyc;
+}
+
/*
- * Display the statistics thru /proc
- * data points to the cpu number
+ * Display the statistics thru /proc.
+ * 'data' points to the cpu number
*/
static int uv_ptc_seq_show(struct seq_file *file, void *data)
{
@@ -536,78 +930,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
if (!cpu) {
seq_printf(file,
- "# cpu requestor requestee one all sretry dretry ptc_i ");
+ "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
seq_printf(file,
- "sw_ack sflush dflush sok dnomsg dmult starget\n");
+ "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+ seq_printf(file,
+ "retries rok resetp resett giveup sto bz throt ");
+ seq_printf(file,
+ "sw_ack recv rtime all ");
+ seq_printf(file,
+ "one mult none retry canc nocan reset rcan\n");
}