Diffstat (limited to 'arch/x86/platform/uv')
 -rw-r--r--  arch/x86/platform/uv/Makefile   |    2
 -rw-r--r--  arch/x86/platform/uv/bios_uv.c  |    3
 -rw-r--r--  arch/x86/platform/uv/tlb_uv.c   | 1948
 -rw-r--r--  arch/x86/platform/uv/uv_irq.c   |   25
 -rw-r--r--  arch/x86/platform/uv/uv_nmi.c   |  727
 -rw-r--r--  arch/x86/platform/uv/uv_sysfs.c |    2
 -rw-r--r--  arch/x86/platform/uv/uv_time.c  |   38
7 files changed, 1991 insertions, 754 deletions
diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile index 6c40995fefb..52079bebd01 100644 --- a/arch/x86/platform/uv/Makefile +++ b/arch/x86/platform/uv/Makefile @@ -1 +1 @@ -obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o +obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o uv_nmi.o diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 8bc57baaa9a..1584cbed0dc 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -20,6 +20,7 @@   */  #include <linux/efi.h> +#include <linux/export.h>  #include <asm/efi.h>  #include <linux/io.h>  #include <asm/uv/bios.h> @@ -38,7 +39,7 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)  		 */  		return BIOS_STATUS_UNIMPLEMENTED; -	ret = efi_call6((void *)__va(tab->function), (u64)which, +	ret = efi_call((void *)__va(tab->function), (u64)which,  			a1, a2, a3, a4, a5);  	return ret;  } diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index a318194002b..dfe605ac1bc 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1,7 +1,7 @@  /*   *	SGI UltraViolet TLB flush routines.   * - *	(c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI. + *	(c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.   *   *	This code is released under the GNU General Public License version 2 or   *	later. @@ -11,6 +11,7 @@  #include <linux/debugfs.h>  #include <linux/kernel.h>  #include <linux/slab.h> +#include <linux/delay.h>  #include <asm/mmu_context.h>  #include <asm/uv/uv.h> @@ -34,27 +35,79 @@ static int timeout_base_ns[] = {  		5242880,  		167772160  }; +  static int timeout_us;  static int nobau; -static int baudisabled; -static spinlock_t disable_lock; +static int nobau_perm;  static cycles_t congested_cycles;  /* tunables: */ -static int max_bau_concurrent = MAX_BAU_CONCURRENT; -static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; -static int plugged_delay = PLUGGED_DELAY; -static int plugsb4reset = PLUGSB4RESET; -static int timeoutsb4reset = TIMEOUTSB4RESET; -static int ipi_reset_limit = IPI_RESET_LIMIT; -static int complete_threshold = COMPLETE_THRESHOLD; -static int congested_response_us = CONGESTED_RESPONSE_US; -static int congested_reps = CONGESTED_REPS; -static int congested_period = CONGESTED_PERIOD; +static int max_concurr		= MAX_BAU_CONCURRENT; +static int max_concurr_const	= MAX_BAU_CONCURRENT; +static int plugged_delay	= PLUGGED_DELAY; +static int plugsb4reset		= PLUGSB4RESET; +static int giveup_limit		= GIVEUP_LIMIT; +static int timeoutsb4reset	= TIMEOUTSB4RESET; +static int ipi_reset_limit	= IPI_RESET_LIMIT; +static int complete_threshold	= COMPLETE_THRESHOLD; +static int congested_respns_us	= CONGESTED_RESPONSE_US; +static int congested_reps	= CONGESTED_REPS; +static int disabled_period	= DISABLED_PERIOD; + +static struct tunables tunables[] = { +	{&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ +	{&plugged_delay, PLUGGED_DELAY}, +	{&plugsb4reset, PLUGSB4RESET}, +	{&timeoutsb4reset, TIMEOUTSB4RESET}, +	{&ipi_reset_limit, IPI_RESET_LIMIT}, +	{&complete_threshold, COMPLETE_THRESHOLD}, +	{&congested_respns_us, CONGESTED_RESPONSE_US}, +	{&congested_reps, CONGESTED_REPS}, +	{&disabled_period, DISABLED_PERIOD}, +	{&giveup_limit, GIVEUP_LIMIT} +}; +  static struct dentry *tunables_dir;  static struct dentry *tunables_file; -static int __init setup_nobau(char *arg) +/* these correspond to the statistics printed by ptc_seq_show() */ +static char 
*stat_description[] = { +	"sent:     number of shootdown messages sent", +	"stime:    time spent sending messages", +	"numuvhubs: number of hubs targeted with shootdown", +	"numuvhubs16: number times 16 or more hubs targeted", +	"numuvhubs8: number times 8 or more hubs targeted", +	"numuvhubs4: number times 4 or more hubs targeted", +	"numuvhubs2: number times 2 or more hubs targeted", +	"numuvhubs1: number times 1 hub targeted", +	"numcpus:  number of cpus targeted with shootdown", +	"dto:      number of destination timeouts", +	"retries:  destination timeout retries sent", +	"rok:   :  destination timeouts successfully retried", +	"resetp:   ipi-style resource resets for plugs", +	"resett:   ipi-style resource resets for timeouts", +	"giveup:   fall-backs to ipi-style shootdowns", +	"sto:      number of source timeouts", +	"bz:       number of stay-busy's", +	"throt:    number times spun in throttle", +	"swack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE", +	"recv:     shootdown messages received", +	"rtime:    time spent processing messages", +	"all:      shootdown all-tlb messages", +	"one:      shootdown one-tlb messages", +	"mult:     interrupts that found multiple messages", +	"none:     interrupts that found no messages", +	"retry:    number of retry messages processed", +	"canc:     number messages canceled by retries", +	"nocan:    number retries that found nothing to cancel", +	"reset:    number of ipi-style reset requests processed", +	"rcan:     number messages canceled by reset requests", +	"disable:  number times use of the BAU was disabled", +	"enable:   number times use of the BAU was re-enabled" +}; + +static int __init +setup_nobau(char *arg)  {  	nobau = 1;  	return 0; @@ -62,15 +115,46 @@ static int __init setup_nobau(char *arg)  early_param("nobau", setup_nobau);  /* base pnode in this partition */ -static int uv_partition_base_pnode __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; -static unsigned long uv_mmask __read_mostly; +static int uv_base_pnode __read_mostly;  static DEFINE_PER_CPU(struct ptc_stats, ptcstats);  static DEFINE_PER_CPU(struct bau_control, bau_control);  static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); +static void +set_bau_on(void) +{ +	int cpu; +	struct bau_control *bcp; + +	if (nobau_perm) { +		pr_info("BAU not initialized; cannot be turned on\n"); +		return; +	} +	nobau = 0; +	for_each_present_cpu(cpu) { +		bcp = &per_cpu(bau_control, cpu); +		bcp->nobau = 0; +	} +	pr_info("BAU turned on\n"); +	return; +} + +static void +set_bau_off(void) +{ +	int cpu; +	struct bau_control *bcp; + +	nobau = 1; +	for_each_present_cpu(cpu) { +		bcp = &per_cpu(bau_control, cpu); +		bcp->nobau = 1; +	} +	pr_info("BAU turned off\n"); +	return; +} +  /*   * Determine the first node on a uvhub. 'Nodes' are used for kernel   * memory allocation. @@ -108,60 +192,53 @@ static int __init uvhub_to_first_apicid(int uvhub)   * clear of the Timeout bit (as well) will free the resource. No reply will   * be sent (the hardware will only do one reply per message).   
*/ -static inline void uv_reply_to_message(struct msg_desc *mdp, -				       struct bau_control *bcp) +static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp, +						int do_acknowledge)  {  	unsigned long dw; -	struct bau_payload_queue_entry *msg; +	struct bau_pq_entry *msg;  	msg = mdp->msg; -	if (!msg->canceled) { -		dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) | -						msg->sw_ack_vector; -		uv_write_local_mmr( -				UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); +	if (!msg->canceled && do_acknowledge) { +		dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; +		write_mmr_sw_ack(dw);  	}  	msg->replied_to = 1; -	msg->sw_ack_vector = 0; +	msg->swack_vec = 0;  }  /*   * Process the receipt of a RETRY message   */ -static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, -					    struct bau_control *bcp) +static void bau_process_retry_msg(struct msg_desc *mdp, +					struct bau_control *bcp)  {  	int i;  	int cancel_count = 0; -	int slot2;  	unsigned long msg_res;  	unsigned long mmr = 0; -	struct bau_payload_queue_entry *msg; -	struct bau_payload_queue_entry *msg2; -	struct ptc_stats *stat; +	struct bau_pq_entry *msg = mdp->msg; +	struct bau_pq_entry *msg2; +	struct ptc_stats *stat = bcp->statp; -	msg = mdp->msg; -	stat = bcp->statp;  	stat->d_retries++;  	/*  	 * cancel any message from msg+1 to the retry itself  	 */  	for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { -		if (msg2 > mdp->va_queue_last) -			msg2 = mdp->va_queue_first; +		if (msg2 > mdp->queue_last) +			msg2 = mdp->queue_first;  		if (msg2 == msg)  			break; -		/* same conditions for cancellation as uv_do_reset */ +		/* same conditions for cancellation as do_reset */  		if ((msg2->replied_to == 0) && (msg2->canceled == 0) && -		    (msg2->sw_ack_vector) && ((msg2->sw_ack_vector & -			msg->sw_ack_vector) == 0) && +		    (msg2->swack_vec) && ((msg2->swack_vec & +			msg->swack_vec) == 0) &&  		    (msg2->sending_cpu == msg->sending_cpu) &&  		    (msg2->msg_type != MSG_NOOP)) { -			slot2 = msg2 - mdp->va_queue_first; -			mmr = uv_read_local_mmr -				(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); -			msg_res = msg2->sw_ack_vector; +			mmr = read_mmr_sw_ack(); +			msg_res = msg2->swack_vec;  			/*  			 * This is a message retry; clear the resources held  			 * by the previous message only if they timed out. @@ -169,17 +246,16 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,  			 * situation to report.  			 */  			if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { +				unsigned long mr;  				/* -				 * is the resource timed out? -				 * make everyone ignore the cancelled message. +				 * Is the resource timed out? +				 * Make everyone ignore the cancelled message.  				 */  				msg2->canceled = 1;  				stat->d_canceled++;  				cancel_count++; -				uv_write_local_mmr( -				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, -					(msg_res << UV_SW_ACK_NPENDING) | -					 msg_res); +				mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res; +				write_mmr_sw_ack(mr);  			}  		}  	} @@ -191,20 +267,19 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,   * Do all the things a cpu should do for a TLB shootdown message.   * Other cpu's may come here at the same time for this message.   
*/ -static void uv_bau_process_message(struct msg_desc *mdp, -				   struct bau_control *bcp) +static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, +						int do_acknowledge)  { -	int msg_ack_count;  	short socket_ack_count = 0; -	struct ptc_stats *stat; -	struct bau_payload_queue_entry *msg; +	short *sp; +	struct atomic_short *asp; +	struct ptc_stats *stat = bcp->statp; +	struct bau_pq_entry *msg = mdp->msg;  	struct bau_control *smaster = bcp->socket_master;  	/*  	 * This must be a normal message, or retry of a normal message  	 */ -	msg = mdp->msg; -	stat = bcp->statp;  	if (msg->address == TLB_FLUSH_ALL) {  		local_flush_tlb();  		stat->d_alltlb++; @@ -221,30 +296,33 @@ static void uv_bau_process_message(struct msg_desc *mdp,  	 * cpu number.  	 */  	if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) -		uv_bau_process_retry_msg(mdp, bcp); +		bau_process_retry_msg(mdp, bcp);  	/* -	 * This is a sw_ack message, so we have to reply to it. +	 * This is a swack message, so we have to reply to it.  	 * Count each responding cpu on the socket. This avoids  	 * pinging the count's cache line back and forth between  	 * the sockets.  	 */ -	socket_ack_count = atomic_add_short_return(1, (struct atomic_short *) -			&smaster->socket_acknowledge_count[mdp->msg_slot]); +	sp = &smaster->socket_acknowledge_count[mdp->msg_slot]; +	asp = (struct atomic_short *)sp; +	socket_ack_count = atom_asr(1, asp);  	if (socket_ack_count == bcp->cpus_in_socket) { +		int msg_ack_count;  		/*  		 * Both sockets dump their completed count total into  		 * the message's count.  		 */ -		smaster->socket_acknowledge_count[mdp->msg_slot] = 0; -		msg_ack_count = atomic_add_short_return(socket_ack_count, -				(struct atomic_short *)&msg->acknowledge_count); +		*sp = 0; +		asp = (struct atomic_short *)&msg->acknowledge_count; +		msg_ack_count = atom_asr(socket_ack_count, asp);  		if (msg_ack_count == bcp->cpus_in_uvhub) {  			/*  			 * All cpus in uvhub saw it; reply +			 * (unless we are in the UV2 workaround)  			 */ -			uv_reply_to_message(mdp, bcp); +			reply_to_message(mdp, bcp, do_acknowledge);  		}  	} @@ -252,14 +330,18 @@ static void uv_bau_process_message(struct msg_desc *mdp,  }  /* - * Determine the first cpu on a uvhub. + * Determine the first cpu on a pnode.   */ -static int uvhub_to_first_cpu(int uvhub) +static int pnode_to_first_cpu(int pnode, struct bau_control *smaster)  {  	int cpu; -	for_each_present_cpu(cpu) -		if (uvhub == uv_cpu_to_blade_id(cpu)) +	struct hub_and_pnode *hpp; + +	for_each_present_cpu(cpu) { +		hpp = &smaster->thp[cpu]; +		if (pnode == hpp->pnode)  			return cpu; +	}  	return -1;  } @@ -267,62 +349,51 @@ static int uvhub_to_first_cpu(int uvhub)   * Last resort when we get a large number of destination timeouts is   * to clear resources held by a given cpu.   * Do this with IPI so that all messages in the BAU message queue - * can be identified by their nonzero sw_ack_vector field. + * can be identified by their nonzero swack_vec field.   *   * This is entered for a single cpu on the uvhub.   * The sender want's this uvhub to free a specific message's - * sw_ack resources. + * swack resources.   
*/ -static void -uv_do_reset(void *ptr) +static void do_reset(void *ptr)  {  	int i; -	int slot; -	int count = 0; -	unsigned long mmr; -	unsigned long msg_res; -	struct bau_control *bcp; -	struct reset_args *rap; -	struct bau_payload_queue_entry *msg; -	struct ptc_stats *stat; +	struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id()); +	struct reset_args *rap = (struct reset_args *)ptr; +	struct bau_pq_entry *msg; +	struct ptc_stats *stat = bcp->statp; -	bcp = &per_cpu(bau_control, smp_processor_id()); -	rap = (struct reset_args *)ptr; -	stat = bcp->statp;  	stat->d_resets++; -  	/*  	 * We're looking for the given sender, and -	 * will free its sw_ack resource. +	 * will free its swack resource.  	 * If all cpu's finally responded after the timeout, its  	 * message 'replied_to' was set.  	 */ -	for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { -		/* uv_do_reset: same conditions for cancellation as -		   uv_bau_process_retry_msg() */ +	for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { +		unsigned long msg_res; +		/* do_reset: same conditions for cancellation as +		   bau_process_retry_msg() */  		if ((msg->replied_to == 0) &&  		    (msg->canceled == 0) &&  		    (msg->sending_cpu == rap->sender) && -		    (msg->sw_ack_vector) && +		    (msg->swack_vec) &&  		    (msg->msg_type != MSG_NOOP)) { +			unsigned long mmr; +			unsigned long mr;  			/*  			 * make everyone else ignore this message  			 */  			msg->canceled = 1; -			slot = msg - bcp->va_queue_first; -			count++;  			/*  			 * only reset the resource if it is still pending  			 */ -			mmr = uv_read_local_mmr -					(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); -			msg_res = msg->sw_ack_vector; +			mmr = read_mmr_sw_ack(); +			msg_res = msg->swack_vec; +			mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;  			if (mmr & msg_res) {  				stat->d_rcanceled++; -				uv_write_local_mmr( -				    UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, -					(msg_res << UV_SW_ACK_NPENDING) | -					 msg_res); +				write_mmr_sw_ack(mr);  			}  		}  	} @@ -333,41 +404,78 @@ uv_do_reset(void *ptr)   * Use IPI to get all target uvhubs to release resources held by   * a given sending cpu number.   
*/ -static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution, -			      int sender) +static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)  { -	int uvhub; -	int cpu; -	cpumask_t mask; +	int pnode; +	int apnode; +	int maskbits; +	int sender = bcp->cpu; +	cpumask_t *mask = bcp->uvhub_master->cpumask; +	struct bau_control *smaster = bcp->socket_master;  	struct reset_args reset_args;  	reset_args.sender = sender; - -	cpus_clear(mask); +	cpus_clear(*mask);  	/* find a single cpu for each uvhub in this distribution mask */ -	for (uvhub = 0; -		    uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE; -		    uvhub++) { -		if (!bau_uvhub_isset(uvhub, distribution)) +	maskbits = sizeof(struct pnmask) * BITSPERBYTE; +	/* each bit is a pnode relative to the partition base pnode */ +	for (pnode = 0; pnode < maskbits; pnode++) { +		int cpu; +		if (!bau_uvhub_isset(pnode, distribution))  			continue; -		/* find a cpu for this uvhub */ -		cpu = uvhub_to_first_cpu(uvhub); -		cpu_set(cpu, mask); +		apnode = pnode + bcp->partition_base_pnode; +		cpu = pnode_to_first_cpu(apnode, smaster); +		cpu_set(cpu, *mask);  	} -	/* IPI all cpus; Preemption is already disabled */ -	smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1); + +	/* IPI all cpus; preemption is already disabled */ +	smp_call_function_many(mask, do_reset, (void *)&reset_args, 1);  	return;  } -static inline unsigned long -cycles_2_us(unsigned long long cyc) +/* + * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative + * number, not an absolute. It converts a duration in cycles to a duration in + * ns. + */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc)  { +	struct cyc2ns_data *data = cyc2ns_read_begin();  	unsigned long long ns; -	unsigned long us; -	ns =  (cyc * per_cpu(cyc2ns, smp_processor_id())) -						>> CYC2NS_SCALE_FACTOR; -	us = ns / 1000; -	return us; + +	ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift); + +	cyc2ns_read_end(data); +	return ns; +} + +/* + * The reverse of the above; converts a duration in ns to a duration in cycles. 
+ */  +static inline unsigned long long ns_2_cycles(unsigned long long ns) +{ +	struct cyc2ns_data *data = cyc2ns_read_begin(); +	unsigned long long cyc; + +	cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul; + +	cyc2ns_read_end(data); +	return cyc; +} + +static inline unsigned long cycles_2_us(unsigned long long cyc) +{ +	return cycles_2_ns(cyc) / NSEC_PER_USEC; +} + +static inline cycles_t sec_2_cycles(unsigned long sec) +{ +	return ns_2_cycles(sec * NSEC_PER_SEC); +} + +static inline unsigned long long usec_2_cycles(unsigned long usec) +{ +	return ns_2_cycles(usec * NSEC_PER_USEC);  }  /* @@ -375,56 +483,56 @@ cycles_2_us(unsigned long long cyc)   * leaves uvhub_quiesce set so that no new broadcasts are started by   * bau_flush_send_and_wait()   */ -static inline void -quiesce_local_uvhub(struct bau_control *hmaster) +static inline void quiesce_local_uvhub(struct bau_control *hmaster)  { -	atomic_add_short_return(1, (struct atomic_short *) -		 &hmaster->uvhub_quiesce); +	atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);  }  /*   * mark this quiet-requestor as done   */ -static inline void -end_uvhub_quiesce(struct bau_control *hmaster) +static inline void end_uvhub_quiesce(struct bau_control *hmaster)  { -	atomic_add_short_return(-1, (struct atomic_short *) -		&hmaster->uvhub_quiesce); +	atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce); +} + +static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift) +{ +	unsigned long descriptor_status; + +	descriptor_status = uv_read_local_mmr(mmr_offset); +	descriptor_status >>= right_shift; +	descriptor_status &= UV_ACT_STATUS_MASK; +	return descriptor_status;  }  /*   * Wait for completion of a broadcast software ack message   * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP   */ -static int uv_wait_completion(struct bau_desc *bau_desc, -	unsigned long mmr_offset, int right_shift, int this_cpu, -	struct bau_control *bcp, struct bau_control *smaster, long try) +static int uv1_wait_completion(struct bau_desc *bau_desc, +				unsigned long mmr_offset, int right_shift, +				struct bau_control *bcp, long try)  {  	unsigned long descriptor_status; -	cycles_t ttime; +	cycles_t ttm;  	struct ptc_stats *stat = bcp->statp; -	struct bau_control *hmaster; - -	hmaster = bcp->uvhub_master; +	descriptor_status = uv1_read_status(mmr_offset, right_shift);  	/* spin on the status MMR, waiting for it to go idle */ -	while ((descriptor_status = (((unsigned long) -		uv_read_local_mmr(mmr_offset) >> -			right_shift) & UV_ACT_STATUS_MASK)) != -			DESC_STATUS_IDLE) { +	while ((descriptor_status != DS_IDLE)) {  		/* -		 * Our software ack messages may be blocked because there are -		 * no swack resources available.  As long as none of them -		 * has timed out hardware will NACK our message and its -		 * state will stay IDLE. +		 * Our software ack messages may be blocked because +		 * there are no swack resources available.  As long +		 * as none of them has timed out hardware will NACK +		 * our message and its state will stay IDLE.  		 
*/ -		if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { +		if (descriptor_status == DS_SOURCE_TIMEOUT) {  			stat->s_stimeout++;  			return FLUSH_GIVEUP; -		} else if (descriptor_status == -					DESC_STATUS_DESTINATION_TIMEOUT) { +		} else if (descriptor_status == DS_DESTINATION_TIMEOUT) {  			stat->s_dtimeout++; -			ttime = get_cycles(); +			ttm = get_cycles();  			/*  			 * Our retries may be blocked by all destination @@ -432,8 +540,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc,  			 * pending.  In that case hardware returns the  			 * ERROR that looks like a destination timeout.  			 */ -			if (cycles_2_us(ttime - bcp->send_message) < -							timeout_us) { +			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {  				bcp->conseccompletes = 0;  				return FLUSH_RETRY_PLUGGED;  			} @@ -446,117 +553,319 @@ static int uv_wait_completion(struct bau_desc *bau_desc,  			 */  			cpu_relax();  		} +		descriptor_status = uv1_read_status(mmr_offset, right_shift);  	}  	bcp->conseccompletes++;  	return FLUSH_COMPLETE;  } -static inline cycles_t -sec_2_cycles(unsigned long sec) +/* + * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. + * But not currently used. + */ +static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)  { -	unsigned long ns; -	cycles_t cyc; +	unsigned long descriptor_status; -	ns = sec * 1000000000; -	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); -	return cyc; +	descriptor_status = +		((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1; +	return descriptor_status;  }  /* - * conditionally add 1 to *v, unless *v is >= u - * return 0 if we cannot add 1 to *v because it is >= u - * return 1 if we can add 1 to *v because it is < u - * the add is atomic - * - * This is close to atomic_add_unless(), but this allows the 'u' value - * to be lowered below the current 'v'.  atomic_add_unless can only stop - * on equal. + * Return whether the status of the descriptor that is normally used for this + * cpu (the one indexed by its hub-relative cpu number) is busy. + * The status of the original 32 descriptors is always reflected in the 64 + * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0. + * The bit provided by the activation_status_2 register is irrelevant to + * the status if it is only being tested for busy or not busy.   */ -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) +int normal_busy(struct bau_control *bcp)  { -	spin_lock(lock); -	if (atomic_read(v) >= u) { -		spin_unlock(lock); -		return 0; +	int cpu = bcp->uvhub_cpu; +	int mmr_offset; +	int right_shift; + +	mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; +	right_shift = cpu * UV_ACT_STATUS_SIZE; +	return (((((read_lmmr(mmr_offset) >> right_shift) & +				UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY); +} + +/* + * Entered when a bau descriptor has gone into a permanent busy wait because + * of a hardware bug. + * Workaround the bug. 
+ */ +int handle_uv2_busy(struct bau_control *bcp) +{ +	struct ptc_stats *stat = bcp->statp; + +	stat->s_uv2_wars++; +	bcp->busy = 1; +	return FLUSH_GIVEUP; +} + +static int uv2_wait_completion(struct bau_desc *bau_desc, +				unsigned long mmr_offset, int right_shift, +				struct bau_control *bcp, long try) +{ +	unsigned long descriptor_stat; +	cycles_t ttm; +	int desc = bcp->uvhub_cpu; +	long busy_reps = 0; +	struct ptc_stats *stat = bcp->statp; + +	descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc); + +	/* spin on the status MMR, waiting for it to go idle */ +	while (descriptor_stat != UV2H_DESC_IDLE) { +		if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) { +			/* +			 * A h/w bug on the destination side may +			 * have prevented the message being marked +			 * pending, thus it doesn't get replied to +			 * and gets continually nacked until it times +			 * out with a SOURCE_TIMEOUT. +			 */ +			stat->s_stimeout++; +			return FLUSH_GIVEUP; +		} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { +			ttm = get_cycles(); + +			/* +			 * Our retries may be blocked by all destination +			 * swack resources being consumed, and a timeout +			 * pending.  In that case hardware returns the +			 * ERROR that looks like a destination timeout. +			 * Without using the extended status we have to +			 * deduce from the short time that this was a +			 * strong nack. +			 */ +			if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { +				bcp->conseccompletes = 0; +				stat->s_plugged++; +				/* FLUSH_RETRY_PLUGGED causes hang on boot */ +				return FLUSH_GIVEUP; +			} +			stat->s_dtimeout++; +			bcp->conseccompletes = 0; +			/* FLUSH_RETRY_TIMEOUT causes hang on boot */ +			return FLUSH_GIVEUP; +		} else { +			busy_reps++; +			if (busy_reps > 1000000) { +				/* not to hammer on the clock */ +				busy_reps = 0; +				ttm = get_cycles(); +				if ((ttm - bcp->send_message) > +						bcp->timeout_interval) +					return handle_uv2_busy(bcp); +			} +			/* +			 * descriptor_stat is still BUSY +			 */ +			cpu_relax(); +		} +		descriptor_stat = uv2_read_status(mmr_offset, right_shift, +									desc);  	} -	atomic_inc(v); -	spin_unlock(lock); -	return 1; +	bcp->conseccompletes++; +	return FLUSH_COMPLETE; +} + +/* + * There are 2 status registers; each and array[32] of 2 bits. Set up for + * which register to read and position in that register based on cpu in + * current hub. + */ +static int wait_completion(struct bau_desc *bau_desc, +				struct bau_control *bcp, long try) +{ +	int right_shift; +	unsigned long mmr_offset; +	int desc = bcp->uvhub_cpu; + +	if (desc < UV_CPUS_PER_AS) { +		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; +		right_shift = desc * UV_ACT_STATUS_SIZE; +	} else { +		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; +		right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); +	} + +	if (bcp->uvhub_version == 1) +		return uv1_wait_completion(bau_desc, mmr_offset, right_shift, +								bcp, try); +	else +		return uv2_wait_completion(bau_desc, mmr_offset, right_shift, +								bcp, try);  }  /* - * Our retries are blocked by all destination swack resources being + * Our retries are blocked by all destination sw ack resources being   * in use, and a timeout is pending. In that case hardware immediately   * returns the ERROR that looks like a destination timeout.   
*/ -static void -destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, +static void destination_plugged(struct bau_desc *bau_desc, +			struct bau_control *bcp,  			struct bau_control *hmaster, struct ptc_stats *stat)  {  	udelay(bcp->plugged_delay);  	bcp->plugged_tries++; +  	if (bcp->plugged_tries >= bcp->plugsb4reset) {  		bcp->plugged_tries = 0; +  		quiesce_local_uvhub(hmaster); +  		spin_lock(&hmaster->queue_lock); -		uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); +		reset_with_ipi(&bau_desc->distribution, bcp);  		spin_unlock(&hmaster->queue_lock); +  		end_uvhub_quiesce(hmaster); +  		bcp->ipi_attempts++;  		stat->s_resets_plug++;  	}  } -static void -destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, -			struct bau_control *hmaster, struct ptc_stats *stat) +static void destination_timeout(struct bau_desc *bau_desc, +			struct bau_control *bcp, struct bau_control *hmaster, +			struct ptc_stats *stat)  { -	hmaster->max_bau_concurrent = 1; +	hmaster->max_concurr = 1;  	bcp->timeout_tries++;  	if (bcp->timeout_tries >= bcp->timeoutsb4reset) {  		bcp->timeout_tries = 0; +  		quiesce_local_uvhub(hmaster); +  		spin_lock(&hmaster->queue_lock); -		uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); +		reset_with_ipi(&bau_desc->distribution, bcp);  		spin_unlock(&hmaster->queue_lock); +  		end_uvhub_quiesce(hmaster); +  		bcp->ipi_attempts++;  		stat->s_resets_timeout++;  	}  }  /* - * Completions are taking a very long time due to a congested numalink - * network. + * Stop all cpus on a uvhub from using the BAU for a period of time. + * This is reversed by check_enable.   */ -static void -disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) +static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)  {  	int tcpu;  	struct bau_control *tbcp; +	struct bau_control *hmaster; +	cycles_t tm1; -	/* let only one cpu do this disabling */ -	spin_lock(&disable_lock); -	if (!baudisabled && bcp->period_requests && -	    ((bcp->period_time / bcp->period_requests) > congested_cycles)) { -		/* it becomes this cpu's job to turn on the use of the -		   BAU again */ -		baudisabled = 1; -		bcp->set_bau_off = 1; -		bcp->set_bau_on_time = get_cycles() + -			sec_2_cycles(bcp->congested_period); +	hmaster = bcp->uvhub_master; +	spin_lock(&hmaster->disable_lock); +	if (!bcp->baudisabled) {  		stat->s_bau_disabled++; +		tm1 = get_cycles();  		for_each_present_cpu(tcpu) {  			tbcp = &per_cpu(bau_control, tcpu); +			if (tbcp->uvhub_master == hmaster) {  				tbcp->baudisabled = 1; +				tbcp->set_bau_on_time = +					tm1 + bcp->disabled_period; +			}  		}  	} -	spin_unlock(&disable_lock); +	spin_unlock(&hmaster->disable_lock);  } -/** - * uv_flush_send_and_wait - * +static void count_max_concurr(int stat, struct bau_control *bcp, +				struct bau_control *hmaster) +{ +	bcp->plugged_tries = 0; +	bcp->timeout_tries = 0; +	if (stat != FLUSH_COMPLETE) +		return; +	if (bcp->conseccompletes <= bcp->complete_threshold) +		return; +	if (hmaster->max_concurr >= hmaster->max_concurr_const) +		return; +	hmaster->max_concurr++; +} + +static void record_send_stats(cycles_t time1, cycles_t time2, +		struct bau_control *bcp, struct ptc_stats *stat, +		int completion_status, int try) +{ +	cycles_t elapsed; + +	if (time2 > time1) { +		elapsed = time2 - time1; +		stat->s_time += elapsed; + +		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { +			bcp->period_requests++; +			bcp->period_time += elapsed; +			if ((elapsed > congested_cycles) && 
+			    (bcp->period_requests > bcp->cong_reps) && +			    ((bcp->period_time / bcp->period_requests) > +							congested_cycles)) { +				stat->s_congested++; +				disable_for_period(bcp, stat); +			} +		} +	} else +		stat->s_requestor--; + +	if (completion_status == FLUSH_COMPLETE && try > 1) +		stat->s_retriesok++; +	else if (completion_status == FLUSH_GIVEUP) { +		stat->s_giveup++; +		if (get_cycles() > bcp->period_end) +			bcp->period_giveups = 0; +		bcp->period_giveups++; +		if (bcp->period_giveups == 1) +			bcp->period_end = get_cycles() + bcp->disabled_period; +		if (bcp->period_giveups > bcp->giveup_limit) { +			disable_for_period(bcp, stat); +			stat->s_giveuplimit++; +		} +	} +} + +/* + * Because of a uv1 hardware bug only a limited number of concurrent + * requests can be made. + */ +static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat) +{ +	spinlock_t *lock = &hmaster->uvhub_lock; +	atomic_t *v; + +	v = &hmaster->active_descriptor_count; +	if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) { +		stat->s_throttles++; +		do { +			cpu_relax(); +		} while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)); +	} +} + +/* + * Handle the completion status of a message send. + */ +static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, +			struct bau_control *bcp, struct bau_control *hmaster, +			struct ptc_stats *stat) +{ +	if (completion_status == FLUSH_RETRY_PLUGGED) +		destination_plugged(bau_desc, bcp, hmaster, stat); +	else if (completion_status == FLUSH_RETRY_TIMEOUT) +		destination_timeout(bau_desc, bcp, hmaster, stat); +} + +/*   * Send a broadcast and wait for it to complete.   *   * The flush_mask contains the cpus the broadcast is to be sent to including @@ -566,115 +875,191 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)   * Returns 1 if it gives up entirely and the original cpu mask is to be   * returned to the kernel.   
*/ -int uv_flush_send_and_wait(struct bau_desc *bau_desc, -			   struct cpumask *flush_mask, struct bau_control *bcp) +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, +	struct bau_desc *bau_desc)  { -	int right_shift; -	int completion_status = 0;  	int seq_number = 0; +	int completion_stat = 0; +	int uv1 = 0;  	long try = 0; -	int cpu = bcp->uvhub_cpu; -	int this_cpu = bcp->cpu; -	unsigned long mmr_offset;  	unsigned long index;  	cycles_t time1;  	cycles_t time2; -	cycles_t elapsed;  	struct ptc_stats *stat = bcp->statp; -	struct bau_control *smaster = bcp->socket_master;  	struct bau_control *hmaster = bcp->uvhub_master; +	struct uv1_bau_msg_header *uv1_hdr = NULL; +	struct uv2_bau_msg_header *uv2_hdr = NULL; -	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, -			&hmaster->active_descriptor_count, -			hmaster->max_bau_concurrent)) { -		stat->s_throttles++; -		do { -			cpu_relax(); -		} while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, -			&hmaster->active_descriptor_count, -			hmaster->max_bau_concurrent)); +	if (bcp->uvhub_version == 1) { +		uv1 = 1; +		uv1_throttle(hmaster, stat);  	} +  	while (hmaster->uvhub_quiesce)  		cpu_relax(); -	if (cpu < UV_CPUS_PER_ACT_STATUS) { -		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; -		right_shift = cpu * UV_ACT_STATUS_SIZE; -	} else { -		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; -		right_shift = -		    ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); -	}  	time1 = get_cycles(); +	if (uv1) +		uv1_hdr = &bau_desc->header.uv1_hdr; +	else +		uv2_hdr = &bau_desc->header.uv2_hdr; +  	do {  		if (try == 0) { -			bau_desc->header.msg_type = MSG_REGULAR; +			if (uv1) +				uv1_hdr->msg_type = MSG_REGULAR; +			else +				uv2_hdr->msg_type = MSG_REGULAR;  			seq_number = bcp->message_number++;  		} else { -			bau_desc->header.msg_type = MSG_RETRY; +			if (uv1) +				uv1_hdr->msg_type = MSG_RETRY; +			else +				uv2_hdr->msg_type = MSG_RETRY;  			stat->s_retry_messages++;  		} -		bau_desc->header.sequence = seq_number; -		index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | -			bcp->uvhub_cpu; + +		if (uv1) +			uv1_hdr->sequence = seq_number; +		else +			uv2_hdr->sequence = seq_number; +		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;  		bcp->send_message = get_cycles(); -		uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); + +		write_mmr_activation(index); +  		try++; -		completion_status = uv_wait_completion(bau_desc, mmr_offset, -			right_shift, this_cpu, bcp, smaster, try); +		completion_stat = wait_completion(bau_desc, bcp, try); + +		handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); -		if (completion_status == FLUSH_RETRY_PLUGGED) { -			destination_plugged(bau_desc, bcp, hmaster, stat); -		} else if (completion_status == FLUSH_RETRY_TIMEOUT) { -			destination_timeout(bau_desc, bcp, hmaster, stat); -		}  		if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {  			bcp->ipi_attempts = 0; -			completion_status = FLUSH_GIVEUP; +			stat->s_overipilimit++; +			completion_stat = FLUSH_GIVEUP;  			break;  		}  		cpu_relax(); -	} while ((completion_status == FLUSH_RETRY_PLUGGED) || -		 (completion_status == FLUSH_RETRY_TIMEOUT)); +	} while ((completion_stat == FLUSH_RETRY_PLUGGED) || +		 (completion_stat == FLUSH_RETRY_TIMEOUT)); +  	time2 = get_cycles(); -	bcp->plugged_tries = 0; -	bcp->timeout_tries = 0; -	if ((completion_status == FLUSH_COMPLETE) && -	    (bcp->conseccompletes > bcp->complete_threshold) && -	    (hmaster->max_bau_concurrent < -					hmaster->max_bau_concurrent_constant)) -			
hmaster->max_bau_concurrent++; + +	count_max_concurr(completion_stat, bcp, hmaster); +  	while (hmaster->uvhub_quiesce)  		cpu_relax(); +  	atomic_dec(&hmaster->active_descriptor_count); -	if (time2 > time1) { -		elapsed = time2 - time1; -		stat->s_time += elapsed; -		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { -			bcp->period_requests++; -			bcp->period_time += elapsed; -			if ((elapsed > congested_cycles) && -			    (bcp->period_requests > bcp->congested_reps)) { -				disable_for_congestion(bcp, stat); + +	record_send_stats(time1, time2, bcp, stat, completion_stat, try); + +	if (completion_stat == FLUSH_GIVEUP) +		/* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */ +		return 1; +	return 0; +} + +/* + * The BAU is disabled for this uvhub. When the disabled time period has + * expired re-enable it. + * Return 0 if it is re-enabled for all cpus on this uvhub. + */ +static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) +{ +	int tcpu; +	struct bau_control *tbcp; +	struct bau_control *hmaster; + +	hmaster = bcp->uvhub_master; +	spin_lock(&hmaster->disable_lock); +	if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) { +		stat->s_bau_reenabled++; +		for_each_present_cpu(tcpu) { +			tbcp = &per_cpu(bau_control, tcpu); +			if (tbcp->uvhub_master == hmaster) { +				tbcp->baudisabled = 0; +				tbcp->period_requests = 0; +				tbcp->period_time = 0; +				tbcp->period_giveups = 0;  			}  		} +		spin_unlock(&hmaster->disable_lock); +		return 0; +	} +	spin_unlock(&hmaster->disable_lock); +	return -1; +} + +static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs, +				int remotes, struct bau_desc *bau_desc) +{ +	stat->s_requestor++; +	stat->s_ntargcpu += remotes + locals; +	stat->s_ntargremotes += remotes; +	stat->s_ntarglocals += locals; + +	/* uvhub statistics */ +	hubs = bau_uvhub_weight(&bau_desc->distribution); +	if (locals) { +		stat->s_ntarglocaluvhub++; +		stat->s_ntargremoteuvhub += (hubs - 1);  	} else -		stat->s_requestor--; -	if (completion_status == FLUSH_COMPLETE && try > 1) -		stat->s_retriesok++; -	else if (completion_status == FLUSH_GIVEUP) { -		stat->s_giveup++; -		return 1; +		stat->s_ntargremoteuvhub += hubs; + +	stat->s_ntarguvhub += hubs; + +	if (hubs >= 16) +		stat->s_ntarguvhub16++; +	else if (hubs >= 8) +		stat->s_ntarguvhub8++; +	else if (hubs >= 4) +		stat->s_ntarguvhub4++; +	else if (hubs >= 2) +		stat->s_ntarguvhub2++; +	else +		stat->s_ntarguvhub1++; +} + +/* + * Translate a cpu mask to the uvhub distribution mask in the BAU + * activation descriptor. + */ +static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, +			struct bau_desc *bau_desc, int *localsp, int *remotesp) +{ +	int cpu; +	int pnode; +	int cnt = 0; +	struct hub_and_pnode *hpp; + +	for_each_cpu(cpu, flush_mask) { +		/* +		 * The distribution vector is a bit map of pnodes, relative +		 * to the partition base pnode (and the partition base nasid +		 * in the header). +		 * Translate cpu to pnode and hub using a local memory array. 
+		 */ +		hpp = &bcp->socket_master->thp[cpu]; +		pnode = hpp->pnode - bcp->partition_base_pnode; +		bau_uvhub_set(pnode, &bau_desc->distribution); +		cnt++; +		if (hpp->uvhub == bcp->uvhub) +			(*localsp)++; +		else +			(*remotesp)++;  	} +	if (!cnt) +		return 1;  	return 0;  } -/** - * uv_flush_tlb_others - globally purge translation cache of a virtual - * address or all TLB's +/* + * globally purge translation cache of a virtual address or all TLB's   * @cpumask: mask of all cpu's in which the address is to be removed   * @mm: mm_struct containing virtual address range - * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) + * @start: start virtual address to be removed from TLB + * @end: end virtual address to be remove from TLB   * @cpu: the current cpu   *   * This is the entry point for initiating any UV global TLB shootdown. @@ -695,11 +1080,9 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,   * done.  The returned pointer is valid till preemption is re-enabled.   */  const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, -					  struct mm_struct *mm, -					  unsigned long va, unsigned int cpu) +				struct mm_struct *mm, unsigned long start, +				unsigned long end, unsigned int cpu)  { -	int tcpu; -	int uvhub;  	int locals = 0;  	int remotes = 0;  	int hubs = 0; @@ -707,31 +1090,33 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,  	struct cpumask *flush_mask;  	struct ptc_stats *stat;  	struct bau_control *bcp; -	struct bau_control *tbcp; +	unsigned long descriptor_status; +	unsigned long status; + +	bcp = &per_cpu(bau_control, cpu); -	/* kernel was booted 'nobau' */ -	if (nobau) +	if (bcp->nobau)  		return cpumask; -	bcp = &per_cpu(bau_control, cpu);  	stat = bcp->statp; +	stat->s_enters++; + +	if (bcp->busy) { +		descriptor_status = +			read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0); +		status = ((descriptor_status >> (bcp->uvhub_cpu * +			UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1; +		if (status == UV2H_DESC_BUSY) +			return cpumask; +		bcp->busy = 0; +	}  	/* bau was disabled due to slow response */  	if (bcp->baudisabled) { -		/* the cpu that disabled it must re-enable it */ -		if (bcp->set_bau_off) { -			if (get_cycles() >= bcp->set_bau_on_time) { -				stat->s_bau_reenabled++; -				baudisabled = 0; -				for_each_present_cpu(tcpu) { -					tbcp = &per_cpu(bau_control, tcpu); -					tbcp->baudisabled = 0; -					tbcp->period_requests = 0; -					tbcp->period_time = 0; -				} -			} +		if (check_enable(bcp, stat)) { +			stat->s_ipifordisabled++; +			return cpumask;  		} -		return cpumask;  	}  	/* @@ -742,63 +1127,107 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,  	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);  	/* don't actually do a shootdown of the local cpu */  	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); +  	if (cpu_isset(cpu, *cpumask))  		stat->s_ntargself++;  	bau_desc = bcp->descriptor_base; -	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; +	bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);  	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - -	/* cpu statistics */ -	for_each_cpu(tcpu, flush_mask) { -		uvhub = uv_cpu_to_blade_id(tcpu); -		bau_uvhub_set(uvhub, &bau_desc->distribution); -		if (uvhub == bcp->uvhub) -			locals++; -		else -			remotes++; -	} -	if ((locals + remotes) == 0) +	if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))  		return NULL; -	stat->s_requestor++; -	stat->s_ntargcpu += remotes 
+ locals; -	stat->s_ntargremotes += remotes; -	stat->s_ntarglocals += locals; -	remotes = bau_uvhub_weight(&bau_desc->distribution); -	/* uvhub statistics */ -	hubs = bau_uvhub_weight(&bau_desc->distribution); -	if (locals) { -		stat->s_ntarglocaluvhub++; -		stat->s_ntargremoteuvhub += (hubs - 1); -	} else -		stat->s_ntargremoteuvhub += hubs; -	stat->s_ntarguvhub += hubs; -	if (hubs >= 16) -		stat->s_ntarguvhub16++; -	else if (hubs >= 8) -		stat->s_ntarguvhub8++; -	else if (hubs >= 4) -		stat->s_ntarguvhub4++; -	else if (hubs >= 2) -		stat->s_ntarguvhub2++; -	else -		stat->s_ntarguvhub1++; +	record_send_statistics(stat, locals, hubs, remotes, bau_desc); -	bau_desc->payload.address = va; +	if (!end || (end - start) <= PAGE_SIZE) +		bau_desc->payload.address = start; +	else +		bau_desc->payload.address = TLB_FLUSH_ALL;  	bau_desc->payload.sending_cpu = cpu; -  	/*  	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,  	 * or 1 if it gave up and the original cpumask should be returned.  	 */ -	if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) +	if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc))  		return NULL;  	else  		return cpumask;  }  /* + * Search the message queue for any 'other' unprocessed message with the + * same software acknowledge resource bit vector as the 'msg' message. + */ +struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, +					   struct bau_control *bcp) +{ +	struct bau_pq_entry *msg_next = msg + 1; +	unsigned char swack_vec = msg->swack_vec; + +	if (msg_next > bcp->queue_last) +		msg_next = bcp->queue_first; +	while (msg_next != msg) { +		if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) && +				(msg_next->swack_vec == swack_vec)) +			return msg_next; +		msg_next++; +		if (msg_next > bcp->queue_last) +			msg_next = bcp->queue_first; +	} +	return NULL; +} + +/* + * UV2 needs to work around a bug in which an arriving message has not + * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. + * Such a message must be ignored. + */ +void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) +{ +	unsigned long mmr_image; +	unsigned char swack_vec; +	struct bau_pq_entry *msg = mdp->msg; +	struct bau_pq_entry *other_msg; + +	mmr_image = read_mmr_sw_ack(); +	swack_vec = msg->swack_vec; + +	if ((swack_vec & mmr_image) == 0) { +		/* +		 * This message was assigned a swack resource, but no +		 * reserved acknowlegment is pending. +		 * The bug has prevented this message from setting the MMR. +		 */ +		/* +		 * Some message has set the MMR 'pending' bit; it might have +		 * been another message.  Look for that message. +		 */ +		other_msg = find_another_by_swack(msg, bcp); +		if (other_msg) { +			/* +			 * There is another. Process this one but do not +			 * ack it. +			 */ +			bau_process_message(mdp, bcp, 0); +			/* +			 * Let the natural processing of that other message +			 * acknowledge it. Don't get the processing of sw_ack's +			 * out of order. +			 */ +			return; +		} +	} + +	/* +	 * Either the MMR shows this one pending a reply or there is no +	 * other message using this sw_ack, so it is safe to acknowledge it. +	 */ +	bau_process_message(mdp, bcp, 1); + +	return; +} + +/*   * The BAU message interrupt comes here. 
(registered by set_intr_gate)   * See entry_64.S   * @@ -816,26 +1245,34 @@ void uv_bau_message_interrupt(struct pt_regs *regs)  {  	int count = 0;  	cycles_t time_start; -	struct bau_payload_queue_entry *msg; +	struct bau_pq_entry *msg;  	struct bau_control *bcp;  	struct ptc_stats *stat;  	struct msg_desc msgdesc; +	ack_APIC_irq();  	time_start = get_cycles(); +  	bcp = &per_cpu(bau_control, smp_processor_id());  	stat = bcp->statp; -	msgdesc.va_queue_first = bcp->va_queue_first; -	msgdesc.va_queue_last = bcp->va_queue_last; + +	msgdesc.queue_first = bcp->queue_first; +	msgdesc.queue_last = bcp->queue_last; +  	msg = bcp->bau_msg_head; -	while (msg->sw_ack_vector) { +	while (msg->swack_vec) {  		count++; -		msgdesc.msg_slot = msg - msgdesc.va_queue_first; -		msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1; + +		msgdesc.msg_slot = msg - msgdesc.queue_first;  		msgdesc.msg = msg; -		uv_bau_process_message(&msgdesc, bcp); +		if (bcp->uvhub_version == 2) +			process_uv2_message(&msgdesc, bcp); +		else +			bau_process_message(&msgdesc, bcp, 1); +  		msg++; -		if (msg > msgdesc.va_queue_last) -			msg = msgdesc.va_queue_first; +		if (msg > msgdesc.queue_last) +			msg = msgdesc.queue_first;  		bcp->bau_msg_head = msg;  	}  	stat->d_time += (get_cycles() - time_start); @@ -843,18 +1280,15 @@ void uv_bau_message_interrupt(struct pt_regs *regs)  		stat->d_nomsg++;  	else if (count > 1)  		stat->d_multmsg++; -	ack_APIC_irq();  }  /* - * uv_enable_timeouts - * - * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have + * Each target uvhub (i.e. a uvhub that has cpu's) needs to have   * shootdown message timeouts enabled.  The timeout does not cause   * an interrupt, but causes an error message to be returned to   * the sender.   */ -static void uv_enable_timeouts(void) +static void __init enable_timeouts(void)  {  	int uvhub;  	int nuvhubs; @@ -868,47 +1302,44 @@ static void uv_enable_timeouts(void)  			continue;  		pnode = uv_blade_to_pnode(uvhub); -		mmr_image = -		    uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); +		mmr_image = read_mmr_misc_control(pnode);  		/*  		 * Set the timeout period and then lock it in, in three  		 * steps; captures and locks in the period.  		 *  		 * To program the period, the SOFT_ACK_MODE must be off.  		 */ -		mmr_image &= ~((unsigned long)1 << -		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); -		uv_write_global_mmr64 -		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); +		mmr_image &= ~(1L << SOFTACK_MSHIFT); +		write_mmr_misc_control(pnode, mmr_image);  		/*  		 * Set the 4-bit period.  		 */ -		mmr_image &= ~((unsigned long)0xf << -		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); -		mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << -		     UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); -		uv_write_global_mmr64 -		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); +		mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT); +		mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT); +		write_mmr_misc_control(pnode, mmr_image);  		/* +		 * UV1:  		 * Subsequent reversals of the timebase bit (3) cause an  		 * immediate timeout of one or all INTD resources as  		 * indicated in bits 2:0 (7 causes all of them to timeout).  		 
*/ -		mmr_image |= ((unsigned long)1 << -		    UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); -		uv_write_global_mmr64 -		    (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); +		mmr_image |= (1L << SOFTACK_MSHIFT); +		if (is_uv2_hub()) { +			/* hw bug workaround; do not use extended status */ +			mmr_image &= ~(1L << UV2_EXT_SHFT); +		} +		write_mmr_misc_control(pnode, mmr_image);  	}  } -static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) +static void *ptc_seq_start(struct seq_file *file, loff_t *offset)  {  	if (*offset < num_possible_cpus())  		return offset;  	return NULL;  } -static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) +static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)  {  	(*offset)++;  	if (*offset < num_possible_cpus()) @@ -916,82 +1347,77 @@ static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)  	return NULL;  } -static void uv_ptc_seq_stop(struct seq_file *file, void *data) -{ -} - -static inline unsigned long long -microsec_2_cycles(unsigned long microsec) +static void ptc_seq_stop(struct seq_file *file, void *data)  { -	unsigned long ns; -	unsigned long long cyc; - -	ns = microsec * 1000; -	cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); -	return cyc;  }  /* - * Display the statistics thru /proc. + * Display the statistics thru /proc/sgi_uv/ptc_statistics   * 'data' points to the cpu number + * Note: see the descriptions in stat_description[].   */ -static int uv_ptc_seq_show(struct seq_file *file, void *data) +static int ptc_seq_show(struct seq_file *file, void *data)  {  	struct ptc_stats *stat; +	struct bau_control *bcp;  	int cpu;  	cpu = *(loff_t *)data; -  	if (!cpu) {  		seq_printf(file, -			"# cpu sent stime self locals remotes ncpus localhub "); +		 "# cpu bauoff sent stime self locals remotes ncpus localhub ");  		seq_printf(file,  			"remotehub numuvhubs numuvhubs16 numuvhubs8 ");  		seq_printf(file, -			"numuvhubs4 numuvhubs2 numuvhubs1 dto "); +			"numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries ");  		seq_printf(file, -			"retries rok resetp resett giveup sto bz throt "); +			"rok resetp resett giveup sto bz throt disable ");  		seq_printf(file, -			"sw_ack recv rtime all "); +			"enable wars warshw warwaits enters ipidis plugged ");  		seq_printf(file, -			"one mult none retry canc nocan reset rcan "); +			"ipiover glim cong swack recv rtime all one mult ");  		seq_printf(file, -			"disable enable\n"); +			"none retry canc nocan reset rcan\n");  	}  	if (cpu < num_possible_cpus() && cpu_online(cpu)) { -		stat = &per_cpu(ptcstats, cpu); +		bcp = &per_cpu(bau_control, cpu); +		stat = bcp->statp;  		/* source side statistics */  		seq_printf(file, -			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", -			   cpu, stat->s_requestor, cycles_2_us(stat->s_time), +			"cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", +			   cpu, bcp->nobau, stat->s_requestor, +			   cycles_2_us(stat->s_time),  			   stat->s_ntargself, stat->s_ntarglocals,  			   stat->s_ntargremotes, stat->s_ntargcpu,  			   stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,  			   stat->s_ntarguvhub, stat->s_ntarguvhub16); -		seq_printf(file, "%ld %ld %ld %ld %ld ", +		seq_printf(file, "%ld %ld %ld %ld %ld %ld ",  			   stat->s_ntarguvhub8, stat->s_ntarguvhub4,  			   stat->s_ntarguvhub2, stat->s_ntarguvhub1, -			   stat->s_dtimeout); +			   stat->s_dtimeout, stat->s_strongnacks);  		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",  			   
stat->s_retry_messages, stat->s_retriesok,  			   stat->s_resets_plug, stat->s_resets_timeout,  			   stat->s_giveup, stat->s_stimeout,  			   stat->s_busy, stat->s_throttles); +		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", +			   stat->s_bau_disabled, stat->s_bau_reenabled, +			   stat->s_uv2_wars, stat->s_uv2_wars_hw, +			   stat->s_uv2_war_waits, stat->s_enters, +			   stat->s_ipifordisabled, stat->s_plugged, +			   stat->s_overipilimit, stat->s_giveuplimit, +			   stat->s_congested);  		/* destination side statistics */  		seq_printf(file, -			   "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", -			   uv_read_global_mmr64(uv_cpu_to_pnode(cpu), -					UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), +			"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", +			   read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),  			   stat->d_requestee, cycles_2_us(stat->d_time),  			   stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,  			   stat->d_nomsg, stat->d_retries, stat->d_canceled,  			   stat->d_nocanceled, stat->d_resets,  			   stat->d_rcanceled); -		seq_printf(file, "%ld %ld\n", -			stat->s_bau_disabled, stat->s_bau_reenabled);  	} -  	return 0;  } @@ -999,18 +1425,19 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)   * Display the tunables thru debugfs   */  static ssize_t tunables_read(struct file *file, char __user *userbuf, -						size_t count, loff_t *ppos) +				size_t count, loff_t *ppos)  {  	char *buf;  	int ret; -	buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", -		"max_bau_concurrent plugged_delay plugsb4reset", -		"timeoutsb4reset ipi_reset_limit complete_threshold", -		"congested_response_us congested_reps congested_period", -		max_bau_concurrent, plugged_delay, plugsb4reset, +	buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n", +		"max_concur plugged_delay plugsb4reset timeoutsb4reset", +		"ipi_reset_limit complete_threshold congested_response_us", +		"congested_reps disabled_period giveup_limit", +		max_concurr, plugged_delay, plugsb4reset,  		timeoutsb4reset, ipi_reset_limit, complete_threshold, -		congested_response_us, congested_reps, congested_period); +		congested_respns_us, congested_reps, disabled_period, +		giveup_limit);  	if (!buf)  		return -ENOMEM; @@ -1021,13 +1448,16 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf,  }  /* - * -1: resetf the statistics + * handle a write to /proc/sgi_uv/ptc_statistics + * -1: reset the statistics   *  0: display meaning of the statistics   */ -static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, -				 size_t count, loff_t *data) +static ssize_t ptc_proc_write(struct file *file, const char __user *user, +				size_t count, loff_t *data)  {  	int cpu; +	int i; +	int elements;  	long input_arg;  	char optstr[64];  	struct ptc_stats *stat; @@ -1037,79 +1467,26 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,  	if (copy_from_user(optstr, user, count))  		return -EFAULT;  	optstr[count - 1] = '\0'; + +	if (!strcmp(optstr, "on")) { +		set_bau_on(); +		return count; +	} else if (!strcmp(optstr, "off")) { +		set_bau_off(); +		return count; +	} +  	if (strict_strtol(optstr, 10, &input_arg) < 0) {  		printk(KERN_DEBUG "%s is invalid\n", optstr);  		return -EINVAL;  	}  	if (input_arg == 0) { +		elements = ARRAY_SIZE(stat_description);  		printk(KERN_DEBUG "# cpu:      cpu number\n");  		printk(KERN_DEBUG "Sender statistics:\n"); -		printk(KERN_DEBUG -		"sent:     number of shootdown messages sent\n"); -		
printk(KERN_DEBUG -		"stime:    time spent sending messages\n"); -		printk(KERN_DEBUG -		"numuvhubs: number of hubs targeted with shootdown\n"); -		printk(KERN_DEBUG -		"numuvhubs16: number times 16 or more hubs targeted\n"); -		printk(KERN_DEBUG -		"numuvhubs8: number times 8 or more hubs targeted\n"); -		printk(KERN_DEBUG -		"numuvhubs4: number times 4 or more hubs targeted\n"); -		printk(KERN_DEBUG -		"numuvhubs2: number times 2 or more hubs targeted\n"); -		printk(KERN_DEBUG -		"numuvhubs1: number times 1 hub targeted\n"); -		printk(KERN_DEBUG -		"numcpus:  number of cpus targeted with shootdown\n"); -		printk(KERN_DEBUG -		"dto:      number of destination timeouts\n"); -		printk(KERN_DEBUG -		"retries:  destination timeout retries sent\n"); -		printk(KERN_DEBUG -		"rok:   :  destination timeouts successfully retried\n"); -		printk(KERN_DEBUG -		"resetp:   ipi-style resource resets for plugs\n"); -		printk(KERN_DEBUG -		"resett:   ipi-style resource resets for timeouts\n"); -		printk(KERN_DEBUG -		"giveup:   fall-backs to ipi-style shootdowns\n"); -		printk(KERN_DEBUG -		"sto:      number of source timeouts\n"); -		printk(KERN_DEBUG -		"bz:       number of stay-busy's\n"); -		printk(KERN_DEBUG -		"throt:    number times spun in throttle\n"); -		printk(KERN_DEBUG "Destination side statistics:\n"); -		printk(KERN_DEBUG -		"sw_ack:   image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); -		printk(KERN_DEBUG -		"recv:     shootdown messages received\n"); -		printk(KERN_DEBUG -		"rtime:    time spent processing messages\n"); -		printk(KERN_DEBUG -		"all:      shootdown all-tlb messages\n"); -		printk(KERN_DEBUG -		"one:      shootdown one-tlb messages\n"); -		printk(KERN_DEBUG -		"mult:     interrupts that found multiple messages\n"); -		printk(KERN_DEBUG -		"none:     interrupts that found no messages\n"); -		printk(KERN_DEBUG -		"retry:    number of retry messages processed\n"); -		printk(KERN_DEBUG -		"canc:     number messages canceled by retries\n"); -		printk(KERN_DEBUG -		"nocan:    number retries that found nothing to cancel\n"); -		printk(KERN_DEBUG -		"reset:    number of ipi-style reset requests processed\n"); -		printk(KERN_DEBUG -		"rcan:     number messages canceled by reset requests\n"); -		printk(KERN_DEBUG -		"disable:  number times use of the BAU was disabled\n"); -		printk(KERN_DEBUG -		"enable:   number times use of the BAU was re-enabled\n"); +		for (i = 0; i < elements; i++) +			printk(KERN_DEBUG "%s\n", stat_description[i]);  	} else if (input_arg == -1) {  		for_each_present_cpu(cpu) {  			stat = &per_cpu(ptcstats, cpu); @@ -1136,27 +1513,18 @@ static int local_atoi(const char *name)  }  /* - * set the tunables - * 0 values reset them to defaults + * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables. + * Zero values reset them to defaults.   
*/ -static ssize_t tunables_write(struct file *file, const char __user *user, -				 size_t count, loff_t *data) +static int parse_tunables_write(struct bau_control *bcp, char *instr, +				int count)  { -	int cpu; -	int cnt = 0; -	int val;  	char *p;  	char *q; -	char instr[64]; -	struct bau_control *bcp; - -	if (count == 0 || count > sizeof(instr)-1) -		return -EINVAL; -	if (copy_from_user(instr, user, count)) -		return -EFAULT; +	int cnt = 0; +	int val; +	int e = ARRAY_SIZE(tunables); -	instr[count] = '\0'; -	/* count the fields */  	p = instr + strspn(instr, WHITESPACE);  	q = p;  	for (; *p; p = q + strspn(q, WHITESPACE)) { @@ -1165,8 +1533,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user,  		if (q == p)  			break;  	} -	if (cnt != 9) { -		printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); +	if (cnt != e) { +		printk(KERN_INFO "bau tunable error: should be %d values\n", e);  		return -EINVAL;  	} @@ -1178,97 +1546,82 @@ static ssize_t tunables_write(struct file *file, const char __user *user,  		switch (cnt) {  		case 0:  			if (val == 0) { -				max_bau_concurrent = MAX_BAU_CONCURRENT; -				max_bau_concurrent_constant = -							MAX_BAU_CONCURRENT; +				max_concurr = MAX_BAU_CONCURRENT; +				max_concurr_const = MAX_BAU_CONCURRENT;  				continue;  			} -			bcp = &per_cpu(bau_control, smp_processor_id());  			if (val < 1 || val > bcp->cpus_in_uvhub) {  				printk(KERN_DEBUG  				"Error: BAU max concurrent %d is invalid\n",  				val);  				return -EINVAL;  			} -			max_bau_concurrent = val; -			max_bau_concurrent_constant = val; -			continue; -		case 1: -			if (val == 0) -				plugged_delay = PLUGGED_DELAY; -			else -				plugged_delay = val; -			continue; -		case 2: -			if (val == 0) -				plugsb4reset = PLUGSB4RESET; -			else -				plugsb4reset = val; -			continue; -		case 3: -			if (val == 0) -				timeoutsb4reset = TIMEOUTSB4RESET; -			else -				timeoutsb4reset = val; -			continue; -		case 4: -			if (val == 0) -				ipi_reset_limit = IPI_RESET_LIMIT; -			else -				ipi_reset_limit = val; +			max_concurr = val; +			max_concurr_const = val;  			continue; -		case 5: -			if (val == 0) -				complete_threshold = COMPLETE_THRESHOLD; -			else -				complete_threshold = val; -			continue; -		case 6: -			if (val == 0) -				congested_response_us = CONGESTED_RESPONSE_US; -			else -				congested_response_us = val; -			continue; -		case 7: -			if (val == 0) -				congested_reps = CONGESTED_REPS; -			else -				congested_reps = val; -			continue; -		case 8: +		default:  			if (val == 0) -				congested_period = CONGESTED_PERIOD; +				*tunables[cnt].tunp = tunables[cnt].deflt;  			else -				congested_period = val; +				*tunables[cnt].tunp = val;  			continue;  		}  		if (q == p)  			break;  	} +	return 0; +} + +/* + * Handle a write to debugfs. 
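The {value-pointer, default} pairing used by the tunables[] table makes the "zero resets to default" rule in parse_tunables_write() a one-liner. A minimal userspace sketch of the same pattern; the tunable names and defaults here are made up, not the kernel's PLUGGED_DELAY/CONGESTED_REPS values:

	#include <stdio.h>
	#include <stdlib.h>

	/* same shape as the kernel's tunables[] entries */
	struct tunable {
		int *tunp;
		int deflt;
	};

	static int plugged_delay  = 10;		/* defaults here are made up */
	static int congested_reps = 10;

	static struct tunable tunables[] = {
		{ &plugged_delay,  10 },
		{ &congested_reps, 10 },
	};

	int main(int argc, char **argv)
	{
		int i, val;
		int e = sizeof(tunables) / sizeof(tunables[0]);

		if (argc != e + 1) {
			fprintf(stderr, "need exactly %d values\n", e);
			return 1;
		}
		for (i = 0; i < e; i++) {
			val = atoi(argv[i + 1]);
			/* zero restores the default, anything else overrides */
			*tunables[i].tunp = val ? val : tunables[i].deflt;
		}
		for (i = 0; i < e; i++)
			printf("%d\n", *tunables[i].tunp);
		return 0;
	}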
(/sys/kernel/debug/sgi_uv/bau_tunables) + */ +static ssize_t tunables_write(struct file *file, const char __user *user, +				size_t count, loff_t *data) +{ +	int cpu; +	int ret; +	char instr[100]; +	struct bau_control *bcp; + +	if (count == 0 || count > sizeof(instr)-1) +		return -EINVAL; +	if (copy_from_user(instr, user, count)) +		return -EFAULT; + +	instr[count] = '\0'; + +	cpu = get_cpu(); +	bcp = &per_cpu(bau_control, cpu); +	ret = parse_tunables_write(bcp, instr, count); +	put_cpu(); +	if (ret) +		return ret; +  	for_each_present_cpu(cpu) {  		bcp = &per_cpu(bau_control, cpu); -		bcp->max_bau_concurrent = max_bau_concurrent; -		bcp->max_bau_concurrent_constant = max_bau_concurrent; -		bcp->plugged_delay = plugged_delay; -		bcp->plugsb4reset = plugsb4reset; -		bcp->timeoutsb4reset = timeoutsb4reset; -		bcp->ipi_reset_limit = ipi_reset_limit; -		bcp->complete_threshold = complete_threshold; -		bcp->congested_response_us = congested_response_us; -		bcp->congested_reps = congested_reps; -		bcp->congested_period = congested_period; +		bcp->max_concurr =		max_concurr; +		bcp->max_concurr_const =	max_concurr; +		bcp->plugged_delay =		plugged_delay; +		bcp->plugsb4reset =		plugsb4reset; +		bcp->timeoutsb4reset =		timeoutsb4reset; +		bcp->ipi_reset_limit =		ipi_reset_limit; +		bcp->complete_threshold =	complete_threshold; +		bcp->cong_response_us =		congested_respns_us; +		bcp->cong_reps =		congested_reps; +		bcp->disabled_period =		sec_2_cycles(disabled_period); +		bcp->giveup_limit =		giveup_limit;  	}  	return count;  }  static const struct seq_operations uv_ptc_seq_ops = { -	.start		= uv_ptc_seq_start, -	.next		= uv_ptc_seq_next, -	.stop		= uv_ptc_seq_stop, -	.show		= uv_ptc_seq_show +	.start		= ptc_seq_start, +	.next		= ptc_seq_next, +	.stop		= ptc_seq_stop, +	.show		= ptc_seq_show  }; -static int uv_ptc_proc_open(struct inode *inode, struct file *file) +static int ptc_proc_open(struct inode *inode, struct file *file)  {  	return seq_open(file, &uv_ptc_seq_ops);  } @@ -1279,9 +1632,9 @@ static int tunables_open(struct inode *inode, struct file *file)  }  static const struct file_operations proc_uv_ptc_operations = { -	.open		= uv_ptc_proc_open, +	.open		= ptc_proc_open,  	.read		= seq_read, -	.write		= uv_ptc_proc_write, +	.write		= ptc_proc_write,  	.llseek		= seq_lseek,  	.release	= seq_release,  }; @@ -1315,7 +1668,7 @@ static int __init uv_ptc_init(void)  		return -EINVAL;  	}  	tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, -			tunables_dir, NULL, &tunables_fops); +					tunables_dir, NULL, &tunables_fops);  	if (!tunables_file) {  		printk(KERN_ERR "unable to create debugfs file %s\n",  		       UV_BAU_TUNABLES_FILE); @@ -1325,57 +1678,77 @@ static int __init uv_ptc_init(void)  }  /* - * initialize the sending side's sending buffers + * Initialize the sending side's sending buffers.   
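From userspace, the tunables_write() path above expects one value per table entry (ten in all), and a zero restores that entry's default. A small sketch of such a write, assuming debugfs is mounted at /sys/kernel/debug as the comments suggest:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* ten values, one per tunables[] entry; zeros restore the defaults */
		const char buf[] = "0 0 0 0 0 0 0 0 0 0\n";
		int fd = open("/sys/kernel/debug/sgi_uv/bau_tunables", O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, buf, strlen(buf)) < 0)
			perror("write");
		close(fd);
		return 0;
	}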
*/ -static void -uv_activation_descriptor_init(int node, int pnode) +static void activation_descriptor_init(int node, int pnode, int base_pnode)  {  	int i;  	int cpu; -	unsigned long pa; +	int uv1 = 0; +	unsigned long gpa;  	unsigned long m;  	unsigned long n; +	size_t dsize;  	struct bau_desc *bau_desc;  	struct bau_desc *bd2; +	struct uv1_bau_msg_header *uv1_hdr; +	struct uv2_bau_msg_header *uv2_hdr;  	struct bau_control *bcp;  	/* -	 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) -	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub +	 * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC) +	 * per cpu; and one per cpu on the uvhub (ADP_SZ)  	 */ -	bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE -				* UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); +	dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC; +	bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);  	BUG_ON(!bau_desc); -	pa = uv_gpa(bau_desc); /* need the real nasid*/ -	n = pa >> uv_nshift; -	m = pa & uv_mmask; - -	uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, -			      (n << UV_DESC_BASE_PNODE_SHIFT | m)); +	gpa = uv_gpa(bau_desc); +	n = uv_gpa_to_gnode(gpa); +	m = uv_gpa_to_offset(gpa); +	if (is_uv1_hub()) +		uv1 = 1; +	/* the 14-bit pnode */ +	write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));  	/* -	 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each +	 * Initializing all 8 (ITEMS_PER_DESC) descriptors for each  	 * cpu even though we only use the first one; one descriptor can  	 * describe a broadcast to 256 uv hubs.  	 */ -	for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); -		i++, bd2++) { +	for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {  		memset(bd2, 0, sizeof(struct bau_desc)); -		bd2->header.sw_ack_flag = 1; -		/* -		 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub -		 * in the partition. The bit map will indicate uvhub numbers, -		 * which are 0-N in a partition. Pnodes are unique system-wide. -		 */ -		bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; -		bd2->header.dest_subnodeid = 0x10; /* the LB */ -		bd2->header.command = UV_NET_ENDPOINT_INTD; -		bd2->header.int_both = 1; -		/* -		 * all others need to be set to zero: -		 *   fairness chaining multilevel count replied_to -		 */ +		if (uv1) { +			uv1_hdr = &bd2->header.uv1_hdr; +			uv1_hdr->swack_flag =	1; +			/* +			 * The base_dest_nasid set in the message header +			 * is the nasid of the first uvhub in the partition. +			 * The bit map will indicate destination pnode numbers +			 * relative to that base. They may not be consecutive +			 * if nasid striding is being used. +			 */ +			uv1_hdr->base_dest_nasid = +						UV_PNODE_TO_NASID(base_pnode); +			uv1_hdr->dest_subnodeid =	UV_LB_SUBNODEID; +			uv1_hdr->command =		UV_NET_ENDPOINT_INTD; +			uv1_hdr->int_both =		1; +			/* +			 * all others need to be set to zero: +			 *   fairness chaining multilevel count replied_to +			 */ +		} else { +			/* +			 * BIOS uses legacy mode, but UV2 hardware always +			 * uses native mode for selective broadcasts. 
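activation_descriptor_init() above splits the descriptor's global address into a node number and an offset before recombining them for the descriptor-base MMR. A stand-alone sketch of that split; the shift width and the sample address are stand-ins, the real layout is hidden behind uv_gpa_to_gnode()/uv_gpa_to_offset() and UV_DESC_PSHIFT:

	#include <stdio.h>

	/* 49 is only a stand-in for the hub's real node shift */
	#define DEMO_NODE_SHIFT		49
	#define DEMO_OFFSET_MASK	((1UL << DEMO_NODE_SHIFT) - 1)

	int main(void)
	{
		unsigned long gpa = 0x0003000001234560UL;	/* made-up global address */
		unsigned long gnode = gpa >> DEMO_NODE_SHIFT;
		unsigned long offset = gpa & DEMO_OFFSET_MASK;

		/* recombined with a node shift, much as the descriptor-base write does */
		printf("gnode %#lx offset %#lx mmr %#lx\n",
		       gnode, offset, (gnode << DEMO_NODE_SHIFT) | offset);
		return 0;
	}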
+			 */ +			uv2_hdr = &bd2->header.uv2_hdr; +			uv2_hdr->swack_flag =	1; +			uv2_hdr->base_dest_nasid = +						UV_PNODE_TO_NASID(base_pnode); +			uv2_hdr->dest_subnodeid =	UV_LB_SUBNODEID; +			uv2_hdr->command =		UV_NET_ENDPOINT_INTD; +		}  	}  	for_each_present_cpu(cpu) {  		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) @@ -1391,57 +1764,56 @@ uv_activation_descriptor_init(int node, int pnode)   * - node is first node (kernel memory notion) on the uvhub   * - pnode is the uvhub's physical identifier   */ -static void -uv_payload_queue_init(int node, int pnode) +static void pq_init(int node, int pnode)  { -	int pn;  	int cpu; +	size_t plsize;  	char *cp; -	unsigned long pa; -	struct bau_payload_queue_entry *pqp; -	struct bau_payload_queue_entry *pqp_malloc; +	void *vp; +	unsigned long pn; +	unsigned long first; +	unsigned long pn_first; +	unsigned long last; +	struct bau_pq_entry *pqp;  	struct bau_control *bcp; -	pqp = kmalloc_node((DEST_Q_SIZE + 1) -			   * sizeof(struct bau_payload_queue_entry), -			   GFP_KERNEL, node); +	plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry); +	vp = kmalloc_node(plsize, GFP_KERNEL, node); +	pqp = (struct bau_pq_entry *)vp;  	BUG_ON(!pqp); -	pqp_malloc = pqp;  	cp = (char *)pqp + 31; -	pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); +	pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5);  	for_each_present_cpu(cpu) {  		if (pnode != uv_cpu_to_pnode(cpu))  			continue;  		/* for every cpu on this pnode: */  		bcp = &per_cpu(bau_control, cpu); -		bcp->va_queue_first = pqp; -		bcp->bau_msg_head = pqp; -		bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1); +		bcp->queue_first	= pqp; +		bcp->bau_msg_head	= pqp; +		bcp->queue_last		= pqp + (DEST_Q_SIZE - 1);  	}  	/* -	 * need the pnode of where the memory was really allocated +	 * need the gnode of where the memory was really allocated  	 */ -	pa = uv_gpa(pqp); -	pn = pa >> uv_nshift; -	uv_write_global_mmr64(pnode, -			      UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, -			      ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | -			      uv_physnodeaddr(pqp)); -	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, -			      uv_physnodeaddr(pqp)); -	uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, -			      (unsigned long) -			      uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1))); +	pn = uv_gpa_to_gnode(uv_gpa(pqp)); +	first = uv_physnodeaddr(pqp); +	pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first; +	last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)); +	write_mmr_payload_first(pnode, pn_first); +	write_mmr_payload_tail(pnode, first); +	write_mmr_payload_last(pnode, last); +	write_gmmr_sw_ack(pnode, 0xffffUL); +  	/* in effect, all msg_type's are set to MSG_NOOP */ -	memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); +	memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);  }  /*   * Initialization of each UV hub's structures   */ -static void __init uv_init_uvhub(int uvhub, int vector) +static void __init init_uvhub(int uvhub, int vector, int base_pnode)  {  	int node;  	int pnode; @@ -1449,24 +1821,24 @@ static void __init uv_init_uvhub(int uvhub, int vector)  	node = uvhub_to_first_node(uvhub);  	pnode = uv_blade_to_pnode(uvhub); -	uv_activation_descriptor_init(node, pnode); -	uv_payload_queue_init(node, pnode); + +	activation_descriptor_init(node, pnode, base_pnode); + +	pq_init(node, pnode);  	/* -	 * the below initialization can't be in firmware because the -	 * messaging IRQ will be determined by the OS +	 * The 
below initialization can't be in firmware because the +	 * messaging IRQ will be determined by the OS.  	 */ -	apicid = uvhub_to_first_apicid(uvhub); -	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, -				      ((apicid << 32) | vector)); +	apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; +	write_mmr_data_config(pnode, ((apicid << 32) | vector));  }  /*   * We will set BAU_MISC_CONTROL with a timeout period.   * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. - * So the destination timeout period has be be calculated from them. + * So the destination timeout period has to be calculated from them.   */ -static int -calculate_destination_timeout(void) +static int calculate_destination_timeout(void)  {  	unsigned long mmr_image;  	int mult1; @@ -1476,125 +1848,254 @@ calculate_destination_timeout(void)  	int ret;  	unsigned long ts_ns; -	mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; -	mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); -	index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; -	mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); -	mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; -	base = timeout_base_ns[index]; -	ts_ns = base * mult1 * mult2; -	ret = ts_ns / 1000; +	if (is_uv1_hub()) { +		mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; +		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); +		index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; +		mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); +		mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; +		ts_ns = timeout_base_ns[index]; +		ts_ns *= (mult1 * mult2); +		ret = ts_ns / 1000; +	} else { +		/* 4 bits  0/1 for 10/80us base, 3 bits of multiplier */ +		mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL); +		mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; +		if (mmr_image & (1L << UV2_ACK_UNITS_SHFT)) +			base = 80; +		else +			base = 10; +		mult1 = mmr_image & UV2_ACK_MASK; +		ret = mult1 * base; +	}  	return ret;  } +static void __init init_per_cpu_tunables(void) +{ +	int cpu; +	struct bau_control *bcp; + +	for_each_present_cpu(cpu) { +		bcp = &per_cpu(bau_control, cpu); +		bcp->baudisabled		= 0; +		if (nobau) +			bcp->nobau		= 1; +		bcp->statp			= &per_cpu(ptcstats, cpu); +		/* time interval to catch a hardware stay-busy bug */ +		bcp->timeout_interval		= usec_2_cycles(2*timeout_us); +		bcp->max_concurr		= max_concurr; +		bcp->max_concurr_const		= max_concurr; +		bcp->plugged_delay		= plugged_delay; +		bcp->plugsb4reset		= plugsb4reset; +		bcp->timeoutsb4reset		= timeoutsb4reset; +		bcp->ipi_reset_limit		= ipi_reset_limit; +		bcp->complete_threshold		= complete_threshold; +		bcp->cong_response_us		= congested_respns_us; +		bcp->cong_reps			= congested_reps; +		bcp->disabled_period =		sec_2_cycles(disabled_period); +		bcp->giveup_limit =		giveup_limit; +		spin_lock_init(&bcp->queue_lock); +		spin_lock_init(&bcp->uvhub_lock); +		spin_lock_init(&bcp->disable_lock); +	} +} +  /* - * initialize the bau_control structure for each cpu + * Scan all cpus to collect blade and socket summaries.   
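On UV1, calculate_destination_timeout() above is essentially the product of a table base and two decoded multipliers, converted from nanoseconds to microseconds. A worked sketch with stand-in numbers (none of these are the real register values):

	#include <stdio.h>

	int main(void)
	{
		/* all three numbers are stand-ins for the decoded register fields */
		unsigned long base_ns = 2560;	/* one timeout_base_ns[] entry   */
		int mult1 = 12;			/* from the misc-control field   */
		int mult2 = 3;			/* from the transaction timeout  */
		unsigned long ts_ns = base_ns * mult1 * mult2;

		printf("destination timeout: %lu us\n", ts_ns / 1000);
		return 0;
	}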
*/ -static void __init uv_init_per_cpu(int nuvhubs) +static int __init get_cpu_topology(int base_pnode, +					struct uvhub_desc *uvhub_descs, +					unsigned char *uvhub_mask)  { -	int i;  	int cpu;  	int pnode;  	int uvhub; -	int have_hmaster; -	short socket = 0; -	unsigned short socket_mask; -	unsigned char *uvhub_mask; +	int socket;  	struct bau_control *bcp;  	struct uvhub_desc *bdp;  	struct socket_desc *sdp; -	struct bau_control *hmaster = NULL; -	struct bau_control *smaster = NULL; -	struct socket_desc { -		short num_cpus; -		short cpu_number[16]; -	}; -	struct uvhub_desc { -		unsigned short socket_mask; -		short num_cpus; -		short uvhub; -		short pnode; -		struct socket_desc socket[2]; -	}; -	struct uvhub_desc *uvhub_descs; - -	timeout_us = calculate_destination_timeout(); -	uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); -	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); -	uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);  	for_each_present_cpu(cpu) {  		bcp = &per_cpu(bau_control, cpu); +  		memset(bcp, 0, sizeof(struct bau_control)); +  		pnode = uv_cpu_hub_info(cpu)->pnode; +		if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) { +			printk(KERN_EMERG +				"cpu %d pnode %d-%d beyond %d; BAU disabled\n", +				cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE); +			return 1; +		} + +		bcp->osnode = cpu_to_node(cpu); +		bcp->partition_base_pnode = base_pnode; +  		uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;  		*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));  		bdp = &uvhub_descs[uvhub]; +  		bdp->num_cpus++;  		bdp->uvhub = uvhub;  		bdp->pnode = pnode; +  		/* kludge: 'assuming' one node per socket, and assuming that  		   disabling a socket just leaves a gap in node numbers */ -		socket = (cpu_to_node(cpu) & 1); +		socket = bcp->osnode & 1;  		bdp->socket_mask |= (1 << socket);  		sdp = &bdp->socket[socket];  		sdp->cpu_number[sdp->num_cpus] = cpu;  		sdp->num_cpus++; +		if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) { +			printk(KERN_EMERG "%d cpus per socket invalid\n", +				sdp->num_cpus); +			return 1; +		}  	} +	return 0; +} + +/* + * Each socket is to get a local array of pnodes/hubs. + */ +static void make_per_cpu_thp(struct bau_control *smaster) +{ +	int cpu; +	size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus(); + +	smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode); +	memset(smaster->thp, 0, hpsz); +	for_each_present_cpu(cpu) { +		smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode; +		smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; +	} +} + +/* + * Each uvhub is to get a local cpumask. + */ +static void make_per_hub_cpumask(struct bau_control *hmaster) +{ +	int sz = sizeof(cpumask_t); + +	hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode); +} + +/* + * Initialize all the per_cpu information for the cpu's on a given socket, + * given what has been gathered into the socket_desc struct. + * And reports the chosen hub and socket masters back to the caller. 
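get_cpu_topology() above boils down to bucketing each cpu into its hub's descriptor and or-ing a bit into that hub's socket mask. A toy version with a hard-coded cpu-to-hub/socket map standing in for uv_cpu_hub_info() and the "osnode & 1" kludge:

	#include <stdio.h>

	#define NCPUS	8
	#define NHUBS	2

	/* stand-in topology; the real code derives this per cpu */
	static const int cpu_hub[NCPUS]    = { 0, 0, 0, 0, 1, 1, 1, 1 };
	static const int cpu_socket[NCPUS] = { 0, 0, 1, 1, 0, 0, 1, 1 };

	struct hub_desc {
		int num_cpus;
		unsigned short socket_mask;
	};

	int main(void)
	{
		static struct hub_desc hubs[NHUBS];
		int cpu, hub;

		for (cpu = 0; cpu < NCPUS; cpu++) {
			struct hub_desc *bdp = &hubs[cpu_hub[cpu]];

			bdp->num_cpus++;
			bdp->socket_mask |= 1 << cpu_socket[cpu];
		}
		for (hub = 0; hub < NHUBS; hub++)
			printf("hub %d: %d cpus, socket mask 0x%x\n",
			       hub, hubs[hub].num_cpus, hubs[hub].socket_mask);
		return 0;
	}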
+ */ +static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, +			struct bau_control **smasterp, +			struct bau_control **hmasterp) +{ +	int i; +	int cpu; +	struct bau_control *bcp; + +	for (i = 0; i < sdp->num_cpus; i++) { +		cpu = sdp->cpu_number[i]; +		bcp = &per_cpu(bau_control, cpu); +		bcp->cpu = cpu; +		if (i == 0) { +			*smasterp = bcp; +			if (!(*hmasterp)) +				*hmasterp = bcp; +		} +		bcp->cpus_in_uvhub = bdp->num_cpus; +		bcp->cpus_in_socket = sdp->num_cpus; +		bcp->socket_master = *smasterp; +		bcp->uvhub = bdp->uvhub; +		if (is_uv1_hub()) +			bcp->uvhub_version = 1; +		else if (is_uv2_hub()) +			bcp->uvhub_version = 2; +		else { +			printk(KERN_EMERG "uvhub version not 1 or 2\n"); +			return 1; +		} +		bcp->uvhub_master = *hmasterp; +		bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; +		if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { +			printk(KERN_EMERG "%d cpus per uvhub invalid\n", +				bcp->uvhub_cpu); +			return 1; +		} +	} +	return 0; +} + +/* + * Summarize the blade and socket topology into the per_cpu structures. + */ +static int __init summarize_uvhub_sockets(int nuvhubs, +			struct uvhub_desc *uvhub_descs, +			unsigned char *uvhub_mask) +{ +	int socket; +	int uvhub; +	unsigned short socket_mask; +  	for (uvhub = 0; uvhub < nuvhubs; uvhub++) { +		struct uvhub_desc *bdp; +		struct bau_control *smaster = NULL; +		struct bau_control *hmaster = NULL; +  		if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))  			continue; -		have_hmaster = 0; +  		bdp = &uvhub_descs[uvhub];  		socket_mask = bdp->socket_mask;  		socket = 0;  		while (socket_mask) { -			if (!(socket_mask & 1)) -				goto nextsocket; -			sdp = &bdp->socket[socket]; -			for (i = 0; i < sdp->num_cpus; i++) { -				cpu = sdp->cpu_number[i]; -				bcp = &per_cpu(bau_control, cpu); -				bcp->cpu = cpu; -				if (i == 0) { -					smaster = bcp; -					if (!have_hmaster) { -						have_hmaster++; -						hmaster = bcp; -					} -				} -				bcp->cpus_in_uvhub = bdp->num_cpus; -				bcp->cpus_in_socket = sdp->num_cpus; -				bcp->socket_master = smaster; -				bcp->uvhub = bdp->uvhub; -				bcp->uvhub_master = hmaster; -				bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> -						blade_processor_id; +			struct socket_desc *sdp; +			if ((socket_mask & 1)) { +				sdp = &bdp->socket[socket]; +				if (scan_sock(sdp, bdp, &smaster, &hmaster)) +					return 1; +				make_per_cpu_thp(smaster);  			} -nextsocket:  			socket++;  			socket_mask = (socket_mask >> 1);  		} +		make_per_hub_cpumask(hmaster);  	} +	return 0; +} + +/* + * initialize the bau_control structure for each cpu + */ +static int __init init_per_cpu(int nuvhubs, int base_part_pnode) +{ +	unsigned char *uvhub_mask; +	void *vp; +	struct uvhub_desc *uvhub_descs; + +	timeout_us = calculate_destination_timeout(); + +	vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); +	uvhub_descs = (struct uvhub_desc *)vp; +	memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); +	uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); + +	if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask)) +		goto fail; + +	if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask)) +		goto fail; +  	kfree(uvhub_descs);  	kfree(uvhub_mask); -	for_each_present_cpu(cpu) { -		bcp = &per_cpu(bau_control, cpu); -		bcp->baudisabled = 0; -		bcp->statp = &per_cpu(ptcstats, cpu); -		/* time interval to catch a hardware stay-busy bug */ -		bcp->timeout_interval = microsec_2_cycles(2*timeout_us); -		bcp->max_bau_concurrent = max_bau_concurrent; -		bcp->max_bau_concurrent_constant = 
max_bau_concurrent; -		bcp->plugged_delay = plugged_delay; -		bcp->plugsb4reset = plugsb4reset; -		bcp->timeoutsb4reset = timeoutsb4reset; -		bcp->ipi_reset_limit = ipi_reset_limit; -		bcp->complete_threshold = complete_threshold; -		bcp->congested_response_us = congested_response_us; -		bcp->congested_reps = congested_reps; -		bcp->congested_period = congested_period; -	} +	init_per_cpu_tunables(); +	return 0; + +fail: +	kfree(uvhub_descs); +	kfree(uvhub_mask); +	return 1;  }  /* @@ -1606,51 +2107,54 @@ static int __init uv_bau_init(void)  	int pnode;  	int nuvhubs;  	int cur_cpu; +	int cpus;  	int vector; -	unsigned long mmr; +	cpumask_var_t *mask;  	if (!is_uv_system())  		return 0; -	if (nobau) -		return 0; - -	for_each_possible_cpu(cur_cpu) -		zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), -				       GFP_KERNEL, cpu_to_node(cur_cpu)); +	for_each_possible_cpu(cur_cpu) { +		mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); +		zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); +	} -	uv_nshift = uv_hub_info->m_val; -	uv_mmask = (1UL << uv_hub_info->m_val) - 1;  	nuvhubs = uv_num_possible_blades(); -	spin_lock_init(&disable_lock); -	congested_cycles = microsec_2_cycles(congested_response_us); +	congested_cycles = usec_2_cycles(congested_respns_us); -	uv_init_per_cpu(nuvhubs); +	uv_base_pnode = 0x7fffffff; +	for (uvhub = 0; uvhub < nuvhubs; uvhub++) { +		cpus = uv_blade_nr_possible_cpus(uvhub); +		if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode)) +			uv_base_pnode = uv_blade_to_pnode(uvhub); +	} + +	enable_timeouts(); -	uv_partition_base_pnode = 0x7fffffff; -	for (uvhub = 0; uvhub < nuvhubs; uvhub++) -		if (uv_blade_nr_possible_cpus(uvhub) && -			(uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) -			uv_partition_base_pnode = uv_blade_to_pnode(uvhub); +	if (init_per_cpu(nuvhubs, uv_base_pnode)) { +		set_bau_off(); +		nobau_perm = 1; +		return 0; +	}  	vector = UV_BAU_MESSAGE;  	for_each_possible_blade(uvhub)  		if (uv_blade_nr_possible_cpus(uvhub)) -			uv_init_uvhub(uvhub, vector); +			init_uvhub(uvhub, vector, uv_base_pnode); -	uv_enable_timeouts();  	alloc_intr_gate(vector, uv_bau_message_intr1);  	for_each_possible_blade(uvhub) {  		if (uv_blade_nr_possible_cpus(uvhub)) { +			unsigned long val; +			unsigned long mmr;  			pnode = uv_blade_to_pnode(uvhub);  			/* INIT the bau */ -			uv_write_global_mmr64(pnode, -					UVH_LB_BAU_SB_ACTIVATION_CONTROL, -					((unsigned long)1 << 63)); +			val = 1L << 63; +			write_gmmr_activation(pnode, val);  			mmr = 1; /* should be 1 to broadcast to both sockets */ -			uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, -						mmr); +			if (!is_uv1_hub()) +				write_mmr_data_broadcast(pnode, mmr);  		}  	} diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 7b24460917d..b233681af4d 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -25,7 +25,7 @@ struct uv_irq_2_mmr_pnode{  	int			irq;  }; -static spinlock_t		uv_irq_lock; +static DEFINE_SPINLOCK(uv_irq_lock);  static struct rb_root		uv_irq_root;  static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); @@ -131,10 +131,11 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,  		       unsigned long mmr_offset, int limit)  {  	const struct cpumask *eligible_cpu = cpumask_of(cpu); -	struct irq_cfg *cfg = get_irq_chip_data(irq); +	struct irq_cfg *cfg = irq_get_chip_data(irq);  	unsigned long mmr_value;  	struct uv_IO_APIC_route_entry *entry;  	int mmr_pnode, err; +	
unsigned int dest;  	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=  			sizeof(unsigned long)); @@ -143,12 +144,16 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,  	if (err != 0)  		return err; +	err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest); +	if (err != 0) +		return err; +  	if (limit == UV_AFFINITY_CPU)  		irq_set_status_flags(irq, IRQ_NO_BALANCING);  	else  		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); -	set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, +	irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,  				      irq_name);  	mmr_value = 0; @@ -159,7 +164,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,  	entry->polarity		= 0;  	entry->trigger		= 0;  	entry->mask		= 0; -	entry->dest		= apic->cpu_mask_to_apicid(eligible_cpu); +	entry->dest		= dest;  	mmr_pnode = uv_blade_to_pnode(mmr_blade);  	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -222,7 +227,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,  	if (cfg->move_in_progress)  		send_cleanup_vector(cfg); -	return 0; +	return IRQ_SET_MASK_OK_NOCOPY;  }  /* @@ -233,11 +238,9 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,  int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,  		 unsigned long mmr_offset, int limit)  { -	int irq, ret; - -	irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); +	int ret, irq = irq_alloc_hwirq(uv_blade_to_memory_nid(mmr_blade)); -	if (irq <= 0) +	if (!irq)  		return -EBUSY;  	ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, @@ -245,7 +248,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,  	if (ret == irq)  		uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);  	else -		destroy_irq(irq); +		irq_free_hwirq(irq);  	return ret;  } @@ -280,6 +283,6 @@ void uv_teardown_irq(unsigned int irq)  			n = n->rb_right;  	}  	spin_unlock_irqrestore(&uv_irq_lock, irqflags); -	destroy_irq(irq); +	irq_free_hwirq(irq);  }  EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c new file mode 100644 index 00000000000..c89c93320c1 --- /dev/null +++ b/arch/x86/platform/uv/uv_nmi.c @@ -0,0 +1,727 @@ +/* + * SGI NMI support routines + * + *  This program is free software; you can redistribute it and/or modify + *  it under the terms of the GNU General Public License as published by + *  the Free Software Foundation; either version 2 of the License, or + *  (at your option) any later version. + * + *  This program is distributed in the hope that it will be useful, + *  but WITHOUT ANY WARRANTY; without even the implied warranty of + *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + *  GNU General Public License for more details. + * + *  You should have received a copy of the GNU General Public License + *  along with this program; if not, write to the Free Software + *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA + * + *  Copyright (c) 2009-2013 Silicon Graphics, Inc.  All Rights Reserved. 
+ *  Copyright (c) Mike Travis + */ + +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/kdb.h> +#include <linux/kexec.h> +#include <linux/kgdb.h> +#include <linux/module.h> +#include <linux/nmi.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/apic.h> +#include <asm/current.h> +#include <asm/kdebug.h> +#include <asm/local64.h> +#include <asm/nmi.h> +#include <asm/traps.h> +#include <asm/uv/uv.h> +#include <asm/uv/uv_hub.h> +#include <asm/uv/uv_mmrs.h> + +/* + * UV handler for NMI + * + * Handle system-wide NMI events generated by the global 'power nmi' command. + * + * Basic operation is to field the NMI interrupt on each cpu and wait + * until all cpus have arrived into the nmi handler.  If some cpus do not + * make it into the handler, try and force them in with the IPI(NMI) signal. + * + * We also have to lessen UV Hub MMR accesses as much as possible as this + * disrupts the UV Hub's primary mission of directing NumaLink traffic and + * can cause system problems to occur. + * + * To do this we register our primary NMI notifier on the NMI_UNKNOWN + * chain.  This reduces the number of false NMI calls when the perf + * tools are running which generate an enormous number of NMIs per + * second (~4M/s for 1024 cpu threads).  Our secondary NMI handler is + * very short as it only checks that if it has been "pinged" with the + * IPI(NMI) signal as mentioned above, and does not read the UV Hub's MMR. + * + */ + +static struct uv_hub_nmi_s **uv_hub_nmi_list; + +DEFINE_PER_CPU(struct uv_cpu_nmi_s, __uv_cpu_nmi); +EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_nmi); + +static unsigned long nmi_mmr; +static unsigned long nmi_mmr_clear; +static unsigned long nmi_mmr_pending; + +static atomic_t	uv_in_nmi; +static atomic_t uv_nmi_cpu = ATOMIC_INIT(-1); +static atomic_t uv_nmi_cpus_in_nmi = ATOMIC_INIT(-1); +static atomic_t uv_nmi_slave_continue; +static cpumask_var_t uv_nmi_cpu_mask; + +/* Values for uv_nmi_slave_continue */ +#define SLAVE_CLEAR	0 +#define SLAVE_CONTINUE	1 +#define SLAVE_EXIT	2 + +/* + * Default is all stack dumps go to the console and buffer. + * Lower level to send to log buffer only. + */ +static int uv_nmi_loglevel = CONSOLE_LOGLEVEL_DEFAULT; +module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644); + +/* + * The following values show statistics on how perf events are affecting + * this system. 
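The "basic operation" described above (wait for every cpu to arrive in the handler, retrying for a bounded time) can be mimicked in a toy userspace analogue: threads stand in for cpus and an atomic flag for the per-cpu NMI state. This is only a sketch of the polling idea, not the kernel code; compile with -pthread:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <unistd.h>

	#define NCPUS 8

	static atomic_int state[NCPUS];		/* 0 = not yet in the "handler" */

	static void *cpu_thread(void *arg)
	{
		int cpu = (int)(long)arg;

		usleep(1000 * cpu);		/* cpus arrive at different times */
		atomic_store(&state[cpu], 1);	/* announce arrival */
		return NULL;
	}

	int main(void)
	{
		pthread_t tid[NCPUS];
		int i, k = 0, retry;

		for (i = 0; i < NCPUS; i++)
			pthread_create(&tid[i], NULL, cpu_thread, (void *)(long)i);

		/* the "master": poll with a bounded retry count */
		for (retry = 0; retry < 100 && k < NCPUS; retry++) {
			k = 0;
			for (i = 0; i < NCPUS; i++)
				if (atomic_load(&state[i]))
					k++;
			usleep(1000);
		}
		printf("%d of %d cpus arrived\n", k, NCPUS);

		for (i = 0; i < NCPUS; i++)
			pthread_join(tid[i], NULL);
		return 0;
	}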
+ */ +static int param_get_local64(char *buffer, const struct kernel_param *kp) +{ +	return sprintf(buffer, "%lu\n", local64_read((local64_t *)kp->arg)); +} + +static int param_set_local64(const char *val, const struct kernel_param *kp) +{ +	/* clear on any write */ +	local64_set((local64_t *)kp->arg, 0); +	return 0; +} + +static struct kernel_param_ops param_ops_local64 = { +	.get = param_get_local64, +	.set = param_set_local64, +}; +#define param_check_local64(name, p) __param_check(name, p, local64_t) + +static local64_t uv_nmi_count; +module_param_named(nmi_count, uv_nmi_count, local64, 0644); + +static local64_t uv_nmi_misses; +module_param_named(nmi_misses, uv_nmi_misses, local64, 0644); + +static local64_t uv_nmi_ping_count; +module_param_named(ping_count, uv_nmi_ping_count, local64, 0644); + +static local64_t uv_nmi_ping_misses; +module_param_named(ping_misses, uv_nmi_ping_misses, local64, 0644); + +/* + * Following values allow tuning for large systems under heavy loading + */ +static int uv_nmi_initial_delay = 100; +module_param_named(initial_delay, uv_nmi_initial_delay, int, 0644); + +static int uv_nmi_slave_delay = 100; +module_param_named(slave_delay, uv_nmi_slave_delay, int, 0644); + +static int uv_nmi_loop_delay = 100; +module_param_named(loop_delay, uv_nmi_loop_delay, int, 0644); + +static int uv_nmi_trigger_delay = 10000; +module_param_named(trigger_delay, uv_nmi_trigger_delay, int, 0644); + +static int uv_nmi_wait_count = 100; +module_param_named(wait_count, uv_nmi_wait_count, int, 0644); + +static int uv_nmi_retry_count = 500; +module_param_named(retry_count, uv_nmi_retry_count, int, 0644); + +/* + * Valid NMI Actions: + *  "dump"	- dump process stack for each cpu + *  "ips"	- dump IP info for each cpu + *  "kdump"	- do crash dump + *  "kdb"	- enter KDB (default) + *  "kgdb"	- enter KGDB + */ +static char uv_nmi_action[8] = "kdb"; +module_param_string(action, uv_nmi_action, sizeof(uv_nmi_action), 0644); + +static inline bool uv_nmi_action_is(const char *action) +{ +	return (strncmp(uv_nmi_action, action, strlen(action)) == 0); +} + +/* Setup which NMI support is present in system */ +static void uv_nmi_setup_mmrs(void) +{ +	if (uv_read_local_mmr(UVH_NMI_MMRX_SUPPORTED)) { +		uv_write_local_mmr(UVH_NMI_MMRX_REQ, +					1UL << UVH_NMI_MMRX_REQ_SHIFT); +		nmi_mmr = UVH_NMI_MMRX; +		nmi_mmr_clear = UVH_NMI_MMRX_CLEAR; +		nmi_mmr_pending = 1UL << UVH_NMI_MMRX_SHIFT; +		pr_info("UV: SMI NMI support: %s\n", UVH_NMI_MMRX_TYPE); +	} else { +		nmi_mmr = UVH_NMI_MMR; +		nmi_mmr_clear = UVH_NMI_MMR_CLEAR; +		nmi_mmr_pending = 1UL << UVH_NMI_MMR_SHIFT; +		pr_info("UV: SMI NMI support: %s\n", UVH_NMI_MMR_TYPE); +	} +} + +/* Read NMI MMR and check if NMI flag was set by BMC. */ +static inline int uv_nmi_test_mmr(struct uv_hub_nmi_s *hub_nmi) +{ +	hub_nmi->nmi_value = uv_read_local_mmr(nmi_mmr); +	atomic_inc(&hub_nmi->read_mmr_count); +	return !!(hub_nmi->nmi_value & nmi_mmr_pending); +} + +static inline void uv_local_mmr_clear_nmi(void) +{ +	uv_write_local_mmr(nmi_mmr_clear, nmi_mmr_pending); +} + +/* + * If first cpu in on this hub, set hub_nmi "in_nmi" and "owner" values and + * return true.  If first cpu in on the system, set global "in_nmi" flag. 
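The "first cpu in" election described above relies on atomic_add_unless(), which increments only while the counter is not already 1, so exactly one caller sees a non-zero return. A kernel-context fragment of the same pattern; the names here are illustrative, not the patch's:

	#include <linux/atomic.h>
	#include <linux/printk.h>

	static atomic_t demo_in_nmi = ATOMIC_INIT(0);

	/*
	 * atomic_add_unless() adds 1 only while the counter is not already 1,
	 * so the first caller gets a non-zero return and becomes the master;
	 * everyone arriving later gets 0.
	 */
	static int demo_claim_master(int cpu)
	{
		if (atomic_add_unless(&demo_in_nmi, 1, 1)) {
			pr_info("cpu %d elected NMI master\n", cpu);
			return 1;
		}
		return 0;
	}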
+ */ +static int uv_set_in_nmi(int cpu, struct uv_hub_nmi_s *hub_nmi) +{ +	int first = atomic_add_unless(&hub_nmi->in_nmi, 1, 1); + +	if (first) { +		atomic_set(&hub_nmi->cpu_owner, cpu); +		if (atomic_add_unless(&uv_in_nmi, 1, 1)) +			atomic_set(&uv_nmi_cpu, cpu); + +		atomic_inc(&hub_nmi->nmi_count); +	} +	return first; +} + +/* Check if this is a system NMI event */ +static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi) +{ +	int cpu = smp_processor_id(); +	int nmi = 0; + +	local64_inc(&uv_nmi_count); +	uv_cpu_nmi.queries++; + +	do { +		nmi = atomic_read(&hub_nmi->in_nmi); +		if (nmi) +			break; + +		if (raw_spin_trylock(&hub_nmi->nmi_lock)) { + +			/* check hub MMR NMI flag */ +			if (uv_nmi_test_mmr(hub_nmi)) { +				uv_set_in_nmi(cpu, hub_nmi); +				nmi = 1; +				break; +			} + +			/* MMR NMI flag is clear */ +			raw_spin_unlock(&hub_nmi->nmi_lock); + +		} else { +			/* wait a moment for the hub nmi locker to set flag */ +			cpu_relax(); +			udelay(uv_nmi_slave_delay); + +			/* re-check hub in_nmi flag */ +			nmi = atomic_read(&hub_nmi->in_nmi); +			if (nmi) +				break; +		} + +		/* check if this BMC missed setting the MMR NMI flag */ +		if (!nmi) { +			nmi = atomic_read(&uv_in_nmi); +			if (nmi) +				uv_set_in_nmi(cpu, hub_nmi); +		} + +	} while (0); + +	if (!nmi) +		local64_inc(&uv_nmi_misses); + +	return nmi; +} + +/* Need to reset the NMI MMR register, but only once per hub. */ +static inline void uv_clear_nmi(int cpu) +{ +	struct uv_hub_nmi_s *hub_nmi = uv_hub_nmi; + +	if (cpu == atomic_read(&hub_nmi->cpu_owner)) { +		atomic_set(&hub_nmi->cpu_owner, -1); +		atomic_set(&hub_nmi->in_nmi, 0); +		uv_local_mmr_clear_nmi(); +		raw_spin_unlock(&hub_nmi->nmi_lock); +	} +} + +/* Print non-responding cpus */ +static void uv_nmi_nr_cpus_pr(char *fmt) +{ +	static char cpu_list[1024]; +	int len = sizeof(cpu_list); +	int c = cpumask_weight(uv_nmi_cpu_mask); +	int n = cpulist_scnprintf(cpu_list, len, uv_nmi_cpu_mask); + +	if (n >= len-1) +		strcpy(&cpu_list[len - 6], "...\n"); + +	printk(fmt, c, cpu_list); +} + +/* Ping non-responding cpus attemping to force them into the NMI handler */ +static void uv_nmi_nr_cpus_ping(void) +{ +	int cpu; + +	for_each_cpu(cpu, uv_nmi_cpu_mask) +		atomic_set(&uv_cpu_nmi_per(cpu).pinging, 1); + +	apic->send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI); +} + +/* Clean up flags for cpus that ignored both NMI and ping */ +static void uv_nmi_cleanup_mask(void) +{ +	int cpu; + +	for_each_cpu(cpu, uv_nmi_cpu_mask) { +		atomic_set(&uv_cpu_nmi_per(cpu).pinging, 0); +		atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_OUT); +		cpumask_clear_cpu(cpu, uv_nmi_cpu_mask); +	} +} + +/* Loop waiting as cpus enter nmi handler */ +static int uv_nmi_wait_cpus(int first) +{ +	int i, j, k, n = num_online_cpus(); +	int last_k = 0, waiting = 0; + +	if (first) { +		cpumask_copy(uv_nmi_cpu_mask, cpu_online_mask); +		k = 0; +	} else { +		k = n - cpumask_weight(uv_nmi_cpu_mask); +	} + +	udelay(uv_nmi_initial_delay); +	for (i = 0; i < uv_nmi_retry_count; i++) { +		int loop_delay = uv_nmi_loop_delay; + +		for_each_cpu(j, uv_nmi_cpu_mask) { +			if (atomic_read(&uv_cpu_nmi_per(j).state)) { +				cpumask_clear_cpu(j, uv_nmi_cpu_mask); +				if (++k >= n) +					break; +			} +		} +		if (k >= n) {		/* all in? 
*/ +			k = n; +			break; +		} +		if (last_k != k) {	/* abort if no new cpus coming in */ +			last_k = k; +			waiting = 0; +		} else if (++waiting > uv_nmi_wait_count) +			break; + +		/* extend delay if waiting only for cpu 0 */ +		if (waiting && (n - k) == 1 && +		    cpumask_test_cpu(0, uv_nmi_cpu_mask)) +			loop_delay *= 100; + +		udelay(loop_delay); +	} +	atomic_set(&uv_nmi_cpus_in_nmi, k); +	return n - k; +} + +/* Wait until all slave cpus have entered UV NMI handler */ +static void uv_nmi_wait(int master) +{ +	/* indicate this cpu is in */ +	atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_IN); + +	/* if not the first cpu in (the master), then we are a slave cpu */ +	if (!master) +		return; + +	do { +		/* wait for all other cpus to gather here */ +		if (!uv_nmi_wait_cpus(1)) +			break; + +		/* if not all made it in, send IPI NMI to them */ +		uv_nmi_nr_cpus_pr(KERN_ALERT +			"UV: Sending NMI IPI to %d non-responding CPUs: %s\n"); +		uv_nmi_nr_cpus_ping(); + +		/* if all cpus are in, then done */ +		if (!uv_nmi_wait_cpus(0)) +			break; + +		uv_nmi_nr_cpus_pr(KERN_ALERT +			"UV: %d CPUs not in NMI loop: %s\n"); +	} while (0); + +	pr_alert("UV: %d of %d CPUs in NMI\n", +		atomic_read(&uv_nmi_cpus_in_nmi), num_online_cpus()); +} + +static void uv_nmi_dump_cpu_ip_hdr(void) +{ +	printk(KERN_DEFAULT +		"\nUV: %4s %6s %-32s %s   (Note: PID 0 not listed)\n", +		"CPU", "PID", "COMMAND", "IP"); +} + +static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs) +{ +	printk(KERN_DEFAULT "UV: %4d %6d %-32.32s ", +		cpu, current->pid, current->comm); + +	printk_address(regs->ip); +} + +/* Dump this cpu's state */ +static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs) +{ +	const char *dots = " ................................. "; + +	if (uv_nmi_action_is("ips")) { +		if (cpu == 0) +			uv_nmi_dump_cpu_ip_hdr(); + +		if (current->pid != 0) +			uv_nmi_dump_cpu_ip(cpu, regs); + +	} else if (uv_nmi_action_is("dump")) { +		printk(KERN_DEFAULT +			"UV:%sNMI process trace for CPU %d\n", dots, cpu); +		show_regs(regs); +	} +	atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE); +} + +/* Trigger a slave cpu to dump it's state */ +static void uv_nmi_trigger_dump(int cpu) +{ +	int retry = uv_nmi_trigger_delay; + +	if (atomic_read(&uv_cpu_nmi_per(cpu).state) != UV_NMI_STATE_IN) +		return; + +	atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_DUMP); +	do { +		cpu_relax(); +		udelay(10); +		if (atomic_read(&uv_cpu_nmi_per(cpu).state) +				!= UV_NMI_STATE_DUMP) +			return; +	} while (--retry > 0); + +	pr_crit("UV: CPU %d stuck in process dump function\n", cpu); +	atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_DUMP_DONE); +} + +/* Wait until all cpus ready to exit */ +static void uv_nmi_sync_exit(int master) +{ +	atomic_dec(&uv_nmi_cpus_in_nmi); +	if (master) { +		while (atomic_read(&uv_nmi_cpus_in_nmi) > 0) +			cpu_relax(); +		atomic_set(&uv_nmi_slave_continue, SLAVE_CLEAR); +	} else { +		while (atomic_read(&uv_nmi_slave_continue)) +			cpu_relax(); +	} +} + +/* Walk through cpu list and dump state of each */ +static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master) +{ +	if (master) { +		int tcpu; +		int ignored = 0; +		int saved_console_loglevel = console_loglevel; + +		pr_alert("UV: tracing %s for %d CPUs from CPU %d\n", +			uv_nmi_action_is("ips") ? 
"IPs" : "processes", +			atomic_read(&uv_nmi_cpus_in_nmi), cpu); + +		console_loglevel = uv_nmi_loglevel; +		atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT); +		for_each_online_cpu(tcpu) { +			if (cpumask_test_cpu(tcpu, uv_nmi_cpu_mask)) +				ignored++; +			else if (tcpu == cpu) +				uv_nmi_dump_state_cpu(tcpu, regs); +			else +				uv_nmi_trigger_dump(tcpu); +		} +		if (ignored) +			printk(KERN_DEFAULT "UV: %d CPUs ignored NMI\n", +				ignored); + +		console_loglevel = saved_console_loglevel; +		pr_alert("UV: process trace complete\n"); +	} else { +		while (!atomic_read(&uv_nmi_slave_continue)) +			cpu_relax(); +		while (atomic_read(&uv_cpu_nmi.state) != UV_NMI_STATE_DUMP) +			cpu_relax(); +		uv_nmi_dump_state_cpu(cpu, regs); +	} +	uv_nmi_sync_exit(master); +} + +static void uv_nmi_touch_watchdogs(void) +{ +	touch_softlockup_watchdog_sync(); +	clocksource_touch_watchdog(); +	rcu_cpu_stall_reset(); +	touch_nmi_watchdog(); +} + +#if defined(CONFIG_KEXEC) +static atomic_t uv_nmi_kexec_failed; +static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) +{ +	/* Call crash to dump system state */ +	if (master) { +		pr_emerg("UV: NMI executing crash_kexec on CPU%d\n", cpu); +		crash_kexec(regs); + +		pr_emerg("UV: crash_kexec unexpectedly returned, "); +		if (!kexec_crash_image) { +			pr_cont("crash kernel not loaded\n"); +			atomic_set(&uv_nmi_kexec_failed, 1); +			uv_nmi_sync_exit(1); +			return; +		} +		pr_cont("kexec busy, stalling cpus while waiting\n"); +	} + +	/* If crash exec fails the slaves should return, otherwise stall */ +	while (atomic_read(&uv_nmi_kexec_failed) == 0) +		mdelay(10); + +	/* Crash kernel most likely not loaded, return in an orderly fashion */ +	uv_nmi_sync_exit(0); +} + +#else /* !CONFIG_KEXEC */ +static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs) +{ +	if (master) +		pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n"); +} +#endif /* !CONFIG_KEXEC */ + +#ifdef CONFIG_KGDB +#ifdef CONFIG_KGDB_KDB +static inline int uv_nmi_kdb_reason(void) +{ +	return KDB_REASON_SYSTEM_NMI; +} +#else /* !CONFIG_KGDB_KDB */ +static inline int uv_nmi_kdb_reason(void) +{ +	/* Insure user is expecting to attach gdb remote */ +	if (uv_nmi_action_is("kgdb")) +		return 0; + +	pr_err("UV: NMI error: KDB is not enabled in this kernel\n"); +	return -1; +} +#endif /* CONFIG_KGDB_KDB */ + +/* + * Call KGDB/KDB from NMI handler + * + * Note that if both KGDB and KDB are configured, then the action of 'kgdb' or + * 'kdb' has no affect on which is used.  See the KGDB documention for further + * information. 
+ */ +static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) +{ +	if (master) { +		int reason = uv_nmi_kdb_reason(); +		int ret; + +		if (reason < 0) +			return; + +		/* call KGDB NMI handler as MASTER */ +		ret = kgdb_nmicallin(cpu, X86_TRAP_NMI, regs, reason, +				&uv_nmi_slave_continue); +		if (ret) { +			pr_alert("KGDB returned error, is kgdboc set?\n"); +			atomic_set(&uv_nmi_slave_continue, SLAVE_EXIT); +		} +	} else { +		/* wait for KGDB signal that it's ready for slaves to enter */ +		int sig; + +		do { +			cpu_relax(); +			sig = atomic_read(&uv_nmi_slave_continue); +		} while (!sig); + +		/* call KGDB as slave */ +		if (sig == SLAVE_CONTINUE) +			kgdb_nmicallback(cpu, regs); +	} +	uv_nmi_sync_exit(master); +} + +#else /* !CONFIG_KGDB */ +static inline void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master) +{ +	pr_err("UV: NMI error: KGDB is not enabled in this kernel\n"); +} +#endif /* !CONFIG_KGDB */ + +/* + * UV NMI handler + */ +int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) +{ +	struct uv_hub_nmi_s *hub_nmi = uv_hub_nmi; +	int cpu = smp_processor_id(); +	int master = 0; +	unsigned long flags; + +	local_irq_save(flags); + +	/* If not a UV System NMI, ignore */ +	if (!atomic_read(&uv_cpu_nmi.pinging) && !uv_check_nmi(hub_nmi)) { +		local_irq_restore(flags); +		return NMI_DONE; +	} + +	/* Indicate we are the first CPU into the NMI handler */ +	master = (atomic_read(&uv_nmi_cpu) == cpu); + +	/* If NMI action is "kdump", then attempt to do it */ +	if (uv_nmi_action_is("kdump")) +		uv_nmi_kdump(cpu, master, regs); + +	/* Pause as all cpus enter the NMI handler */ +	uv_nmi_wait(master); + +	/* Dump state of each cpu */ +	if (uv_nmi_action_is("ips") || uv_nmi_action_is("dump")) +		uv_nmi_dump_state(cpu, regs, master); + +	/* Call KGDB/KDB if enabled */ +	else if (uv_nmi_action_is("kdb") || uv_nmi_action_is("kgdb")) +		uv_call_kgdb_kdb(cpu, regs, master); + +	/* Clear per_cpu "in nmi" flag */ +	atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_OUT); + +	/* Clear MMR NMI flag on each hub */ +	uv_clear_nmi(cpu); + +	/* Clear global flags */ +	if (master) { +		if (cpumask_weight(uv_nmi_cpu_mask)) +			uv_nmi_cleanup_mask(); +		atomic_set(&uv_nmi_cpus_in_nmi, -1); +		atomic_set(&uv_nmi_cpu, -1); +		atomic_set(&uv_in_nmi, 0); +	} + +	uv_nmi_touch_watchdogs(); +	local_irq_restore(flags); + +	return NMI_HANDLED; +} + +/* + * NMI handler for pulling in CPUs when perf events are grabbing our NMI + */ +static int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs) +{ +	int ret; + +	uv_cpu_nmi.queries++; +	if (!atomic_read(&uv_cpu_nmi.pinging)) { +		local64_inc(&uv_nmi_ping_misses); +		return NMI_DONE; +	} + +	uv_cpu_nmi.pings++; +	local64_inc(&uv_nmi_ping_count); +	ret = uv_handle_nmi(reason, regs); +	atomic_set(&uv_cpu_nmi.pinging, 0); +	return ret; +} + +static void uv_register_nmi_notifier(void) +{ +	if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv")) +		pr_warn("UV: NMI handler failed to register\n"); + +	if (register_nmi_handler(NMI_LOCAL, uv_handle_nmi_ping, 0, "uvping")) +		pr_warn("UV: PING NMI handler failed to register\n"); +} + +void uv_nmi_init(void) +{ +	unsigned int value; + +	/* +	 * Unmask NMI on all cpus +	 */ +	value = apic_read(APIC_LVT1) | APIC_DM_NMI; +	value &= ~APIC_LVT_MASKED; +	apic_write(APIC_LVT1, value); +} + +void uv_nmi_setup(void) +{ +	int size = sizeof(void *) * (1 << NODES_SHIFT); +	int cpu, nid; + +	/* Setup hub nmi info */ +	uv_nmi_setup_mmrs(); +	uv_hub_nmi_list = kzalloc(size, GFP_KERNEL); +	pr_info("UV: 
NMI hub list @ 0x%p (%d)\n", uv_hub_nmi_list, size); +	BUG_ON(!uv_hub_nmi_list); +	size = sizeof(struct uv_hub_nmi_s); +	for_each_present_cpu(cpu) { +		nid = cpu_to_node(cpu); +		if (uv_hub_nmi_list[nid] == NULL) { +			uv_hub_nmi_list[nid] = kzalloc_node(size, +							    GFP_KERNEL, nid); +			BUG_ON(!uv_hub_nmi_list[nid]); +			raw_spin_lock_init(&(uv_hub_nmi_list[nid]->nmi_lock)); +			atomic_set(&uv_hub_nmi_list[nid]->cpu_owner, -1); +		} +		uv_hub_nmi_per(cpu) = uv_hub_nmi_list[nid]; +	} +	BUG_ON(!alloc_cpumask_var(&uv_nmi_cpu_mask, GFP_KERNEL)); +	uv_register_nmi_notifier(); +} diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c index 309c70fb775..5d4ba301e77 100644 --- a/arch/x86/platform/uv/uv_sysfs.c +++ b/arch/x86/platform/uv/uv_sysfs.c @@ -19,7 +19,7 @@   *  Copyright (c) Russ Anderson   */ -#include <linux/sysdev.h> +#include <linux/device.h>  #include <asm/uv/bios.h>  #include <asm/uv/uv.h> diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 56e421bc379..5c86786bbfd 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -15,7 +15,7 @@   *  along with this program; if not, write to the Free Software   *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA   * - *  Copyright (c) 2009 Silicon Graphics, Inc.  All Rights Reserved. + *  Copyright (c) 2009-2013 Silicon Graphics, Inc.  All Rights Reserved.   *  Copyright (c) Dimitri Sivanich   */  #include <linux/clockchips.h> @@ -37,10 +37,9 @@ static void uv_rtc_timer_setup(enum clock_event_mode,  static struct clocksource clocksource_uv = {  	.name		= RTC_NAME, -	.rating		= 400, +	.rating		= 299,  	.read		= uv_read_rtc,  	.mask		= (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, -	.shift		= 10,  	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,  }; @@ -89,6 +88,7 @@ static void uv_rtc_send_IPI(int cpu)  	apicid = cpu_physical_id(cpu);  	pnode = uv_apicid_to_pnode(apicid); +	apicid |= uv_apicid_hibits;  	val = (1UL << UVH_IPI_INT_SEND_SHFT) |  	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) |  	      (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); @@ -99,25 +99,35 @@ static void uv_rtc_send_IPI(int cpu)  /* Check for an RTC interrupt pending */  static int uv_intr_pending(int pnode)  { -	return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & -		UVH_EVENT_OCCURRED0_RTC1_MASK; +	if (is_uv1_hub()) +		return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & +			UV1H_EVENT_OCCURRED0_RTC1_MASK; +	else if (is_uvx_hub()) +		return uv_read_global_mmr64(pnode, UVXH_EVENT_OCCURRED2) & +			UVXH_EVENT_OCCURRED2_RTC_1_MASK; +	return 0;  }  /* Setup interrupt and return non-zero if early expiration occurred. 
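uv_rtc_send_IPI() above packs the send bit, the hibits-adjusted apicid and the vector into one MMR word with shifts and ors. A stand-alone sketch of that packing; the shift positions and values below are placeholders, not the real UVH_IPI_INT_* layout:

	#include <stdio.h>

	/* placeholder field positions, not the hardware layout */
	#define DEMO_SEND_SHFT		63
	#define DEMO_APIC_ID_SHFT	32
	#define DEMO_VECTOR_SHFT	8

	int main(void)
	{
		unsigned long apicid = 0x40, vector = 0xf7;	/* made-up values */
		unsigned long val = (1UL << DEMO_SEND_SHFT) |
				    (apicid << DEMO_APIC_ID_SHFT) |
				    (vector << DEMO_VECTOR_SHFT);

		printf("IPI_INT value: %#lx\n", val);
		return 0;
	}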
*/  static int uv_setup_intr(int cpu, u64 expires)  {  	u64 val; +	unsigned long apicid = cpu_physical_id(cpu) | uv_apicid_hibits;  	int pnode = uv_cpu_to_pnode(cpu);  	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,  		UVH_RTC1_INT_CONFIG_M_MASK);  	uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); -	uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, -		UVH_EVENT_OCCURRED0_RTC1_MASK); +	if (is_uv1_hub()) +		uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, +				UV1H_EVENT_OCCURRED0_RTC1_MASK); +	else +		uv_write_global_mmr64(pnode, UVXH_EVENT_OCCURRED2_ALIAS, +				UVXH_EVENT_OCCURRED2_RTC_1_MASK);  	val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | -		((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); +		((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);  	/* Set configuration */  	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); @@ -149,10 +159,9 @@ static __init int uv_rtc_allocate_timers(void)  {  	int cpu; -	blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); +	blade_info = kzalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);  	if (!blade_info)  		return -ENOMEM; -	memset(blade_info, 0, uv_possible_blades * sizeof(void *));  	for_each_present_cpu(cpu) {  		int nid = cpu_to_node(cpu); @@ -370,14 +379,7 @@ static __init int uv_rtc_setup_clock(void)  	if (!is_uv_system())  		return -ENODEV; -	clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, -				clocksource_uv.shift); - -	/* If single blade, prefer tsc */ -	if (uv_num_possible_blades() == 1) -		clocksource_uv.rating = 250; - -	rc = clocksource_register(&clocksource_uv); +	rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second);  	if (rc)  		printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);  	else  | 
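The uv_time.c change above drops the hand-computed mult/shift in favour of clocksource_register_hz(), which derives them from the counter frequency. A minimal, illustrative clocksource registered that way; the name, mask width and frequency are made up, and a real driver would read its hardware counter:

	#include <linux/clocksource.h>
	#include <linux/init.h>

	static cycle_t demo_cs_read(struct clocksource *cs)
	{
		return 0;	/* a real driver reads its hardware counter here */
	}

	static struct clocksource demo_cs = {
		.name	= "demo",
		.rating	= 299,			/* stays just below the TSC's usual 300 */
		.read	= demo_cs_read,
		.mask	= CLOCKSOURCE_MASK(56),
		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	};

	static int __init demo_cs_init(void)
	{
		/* pass the counter frequency; the core computes mult/shift */
		return clocksource_register_hz(&demo_cs, 100 * 1000 * 1000);
	}
	device_initcall(demo_cs_init);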