Diffstat (limited to 'arch/x86/kernel/tsc.c')
| -rw-r--r-- | arch/x86/kernel/tsc.c | 595 | 
1 file changed, 441 insertions, 154 deletions
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 0c40d8b7241..ea030319b32 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +  #include <linux/kernel.h>  #include <linux/sched.h>  #include <linux/init.h> @@ -5,11 +7,11 @@  #include <linux/timer.h>  #include <linux/acpi_pmtmr.h>  #include <linux/cpufreq.h> -#include <linux/dmi.h>  #include <linux/delay.h>  #include <linux/clocksource.h>  #include <linux/percpu.h>  #include <linux/timex.h> +#include <linux/static_key.h>  #include <asm/hpet.h>  #include <asm/timer.h> @@ -36,13 +38,244 @@ static int __read_mostly tsc_unstable;     erroneous rdtsc usage on !cpu_has_tsc processors */  static int __read_mostly tsc_disabled = -1; -static int tsc_clocksource_reliable; +static struct static_key __use_tsc = STATIC_KEY_INIT; + +int tsc_clocksource_reliable; + +/* + * Use a ring-buffer like data structure, where a writer advances the head by + * writing a new data entry and a reader advances the tail when it observes a + * new entry. + * + * Writers are made to wait on readers until there's space to write a new + * entry. + * + * This means that we can always use an {offset, mul} pair to compute a ns + * value that is 'roughly' in the right direction, even if we're writing a new + * {offset, mul} pair during the clock read. + * + * The down-side is that we can no longer guarantee strict monotonicity anymore + * (assuming the TSC was that to begin with), because while we compute the + * intersection point of the two clock slopes and make sure the time is + * continuous at the point of switching; we can no longer guarantee a reader is + * strictly before or after the switch point. + * + * It does mean a reader no longer needs to disable IRQs in order to avoid + * CPU-Freq updates messing with his times, and similarly an NMI reader will + * no longer run the risk of hitting half-written state. + */ + +struct cyc2ns { +	struct cyc2ns_data data[2];	/*  0 + 2*24 = 48 */ +	struct cyc2ns_data *head;	/* 48 + 8    = 56 */ +	struct cyc2ns_data *tail;	/* 56 + 8    = 64 */ +}; /* exactly fits one cacheline */ + +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); + +struct cyc2ns_data *cyc2ns_read_begin(void) +{ +	struct cyc2ns_data *head; + +	preempt_disable(); + +	head = this_cpu_read(cyc2ns.head); +	/* +	 * Ensure we observe the entry when we observe the pointer to it. +	 * matches the wmb from cyc2ns_write_end(). +	 */ +	smp_read_barrier_depends(); +	head->__count++; +	barrier(); + +	return head; +} + +void cyc2ns_read_end(struct cyc2ns_data *head) +{ +	barrier(); +	/* +	 * If we're the outer most nested read; update the tail pointer +	 * when we're done. This notifies possible pending writers +	 * that we've observed the head pointer and that the other +	 * entry is now free. +	 */ +	if (!--head->__count) { +		/* +		 * x86-TSO does not reorder writes with older reads; +		 * therefore once this write becomes visible to another +		 * cpu, we must be finished reading the cyc2ns_data. +		 * +		 * matches with cyc2ns_write_begin(). +		 */ +		this_cpu_write(cyc2ns.tail, head); +	} +	preempt_enable(); +} + +/* + * Begin writing a new @data entry for @cpu. + * + * Assumes some sort of write side lock; currently 'provided' by the assumption + * that cpufreq will call its notifiers sequentially. 
+ */ +static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +{ +	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); +	struct cyc2ns_data *data = c2n->data; + +	if (data == c2n->head) +		data++; + +	/* XXX send an IPI to @cpu in order to guarantee a read? */ + +	/* +	 * When we observe the tail write from cyc2ns_read_end(), +	 * the cpu must be done with that entry and its safe +	 * to start writing to it. +	 */ +	while (c2n->tail == data) +		cpu_relax(); + +	return data; +} + +static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +{ +	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + +	/* +	 * Ensure the @data writes are visible before we publish the +	 * entry. Matches the data-depencency in cyc2ns_read_begin(). +	 */ +	smp_wmb(); + +	ACCESS_ONCE(c2n->head) = data; +} + +/* + * Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + *  basic equation: + *              ns = cycles / (freq / ns_per_sec) + *              ns = cycles * (ns_per_sec / freq) + *              ns = cycles * (10^9 / (cpu_khz * 10^3)) + *              ns = cycles * (10^6 / cpu_khz) + * + *      Then we use scaling math (suggested by george@mvista.com) to get: + *              ns = cycles * (10^6 * SC / cpu_khz) / SC + *              ns = cycles * cyc2ns_scale / SC + * + *      And since SC is a constant power of two, we can convert the div + *  into a shift. + * + *  We can use khz divisor instead of mhz to keep a better precision, since + *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + *  (mathieu.desnoyers@polymtl.ca) + * + *                      -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static void cyc2ns_data_init(struct cyc2ns_data *data) +{ +	data->cyc2ns_mul = 0; +	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; +	data->cyc2ns_offset = 0; +	data->__count = 0; +} + +static void cyc2ns_init(int cpu) +{ +	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + +	cyc2ns_data_init(&c2n->data[0]); +	cyc2ns_data_init(&c2n->data[1]); + +	c2n->head = c2n->data; +	c2n->tail = c2n->data; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ +	struct cyc2ns_data *data, *tail; +	unsigned long long ns; + +	/* +	 * See cyc2ns_read_*() for details; replicated in order to avoid +	 * an extra few instructions that came with the abstraction. +	 * Notable, it allows us to only do the __count and tail update +	 * dance when its actually needed. +	 */ + +	preempt_disable_notrace(); +	data = this_cpu_read(cyc2ns.head); +	tail = this_cpu_read(cyc2ns.tail); + +	if (likely(data == tail)) { +		ns = data->cyc2ns_offset; +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); +	} else { +		data->__count++; + +		barrier(); + +		ns = data->cyc2ns_offset; +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + +		barrier(); + +		if (!--data->__count) +			this_cpu_write(cyc2ns.tail, data); +	} +	preempt_enable_notrace(); + +	return ns; +} + +/* XXX surely we already have this someplace in the kernel?! 
*/ +#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) + +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ +	unsigned long long tsc_now, ns_now; +	struct cyc2ns_data *data; +	unsigned long flags; + +	local_irq_save(flags); +	sched_clock_idle_sleep_event(); + +	if (!cpu_khz) +		goto done; + +	data = cyc2ns_write_begin(cpu); + +	rdtscll(tsc_now); +	ns_now = cycles_2_ns(tsc_now); + +	/* +	 * Compute a new multiplier as per the above comment and ensure our +	 * time function is continuous; see the comment near struct +	 * cyc2ns_data. +	 */ +	data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); +	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; +	data->cyc2ns_offset = ns_now - +		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + +	cyc2ns_write_end(cpu, data); + +done: +	sched_clock_idle_wakeup_event(0); +	local_irq_restore(flags); +}  /*   * Scheduler clock - returns current time in nanosec units.   */  u64 native_sched_clock(void)  { -	u64 this_offset; +	u64 tsc_now;  	/*  	 * Fall back to jiffies if there's no TSC available: @@ -52,16 +285,16 @@ u64 native_sched_clock(void)  	 *   very important for it to be as fast as the platform  	 *   can achieve it. )  	 */ -	if (unlikely(tsc_disabled)) { +	if (!static_key_false(&__use_tsc)) {  		/* No locking but a rare wrong value is not a big deal: */  		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);  	}  	/* read the Time Stamp Counter: */ -	rdtscll(this_offset); +	rdtscll(tsc_now);  	/* return the value in ns */ -	return __cycles_2_ns(this_offset); +	return cycles_2_ns(tsc_now);  }  /* We need to define a real function for sched_clock, to override the @@ -76,17 +309,28 @@ unsigned long long  sched_clock(void) __attribute__((alias("native_sched_clock")));  #endif +unsigned long long native_read_tsc(void) +{ +	return __native_read_tsc(); +} +EXPORT_SYMBOL(native_read_tsc); +  int check_tsc_unstable(void)  {  	return tsc_unstable;  }  EXPORT_SYMBOL_GPL(check_tsc_unstable); +int check_tsc_disabled(void) +{ +	return tsc_disabled; +} +EXPORT_SYMBOL_GPL(check_tsc_disabled); +  #ifdef CONFIG_X86_TSC  int __init notsc_setup(char *str)  { -	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " -			"cannot disable TSC completely.\n"); +	pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n");  	tsc_disabled = 1;  	return 1;  } @@ -179,11 +423,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)  }  #define CAL_MS		10 -#define CAL_LATCH	(CLOCK_TICK_RATE / (1000 / CAL_MS)) +#define CAL_LATCH	(PIT_TICK_RATE / (1000 / CAL_MS))  #define CAL_PIT_LOOPS	1000  #define CAL2_MS		50 -#define CAL2_LATCH	(CLOCK_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_LATCH	(PIT_TICK_RATE / (1000 / CAL2_MS))  #define CAL2_PIT_LOOPS	5000 @@ -291,14 +535,15 @@ static inline int pit_verify_msb(unsigned char val)  static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)  {  	int count; -	u64 tsc = 0; +	u64 tsc = 0, prev_tsc = 0;  	for (count = 0; count < 50000; count++) {  		if (!pit_verify_msb(val))  			break; +		prev_tsc = tsc;  		tsc = get_cycles();  	} -	*deltap = get_cycles() - tsc; +	*deltap = get_cycles() - prev_tsc;  	*tscp = tsc;  	/* @@ -312,9 +557,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de   * How many MSB values do we want to see? We aim for   * a maximum error rate of 500ppm (in practice the   * real error is much smaller), but refuse to spend - * more than 25ms on it. 
+ * more than 50ms on it.   */ -#define MAX_QUICK_PIT_MS 25 +#define MAX_QUICK_PIT_MS 50  #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)  static unsigned long quick_pit_calibrate(void) @@ -373,7 +618,7 @@ static unsigned long quick_pit_calibrate(void)  			goto success;  		}  	} -	printk("Fast TSC calibration failed\n"); +	pr_err("Fast TSC calibration failed\n");  	return 0;  success: @@ -384,18 +629,15 @@ success:  	 *  	 * As a result, we can depend on there not being  	 * any odd delays anywhere, and the TSC reads are -	 * reliable (within the error). We also adjust the -	 * delta to the middle of the error bars, just -	 * because it looks nicer. +	 * reliable (within the error).  	 *  	 * kHz = ticks / time-in-seconds / 1000;  	 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000  	 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)  	 */ -	delta += (long)(d2 - d1)/2;  	delta *= PIT_TICK_RATE;  	do_div(delta, i*256*1000); -	printk("Fast TSC calibration using PIT\n"); +	pr_info("Fast TSC calibration using PIT\n");  	return delta;  } @@ -409,6 +651,13 @@ unsigned long native_calibrate_tsc(void)  	unsigned long flags, latch, ms, fast_calibrate;  	int hpet = is_hpet_enabled(), i, loopmin; +	/* Calibrate TSC using MSR for Intel Atom SoCs */ +	local_irq_save(flags); +	fast_calibrate = try_msr_calibrate_tsc(); +	local_irq_restore(flags); +	if (fast_calibrate) +		return fast_calibrate; +  	local_irq_save(flags);  	fast_calibrate = quick_pit_calibrate();  	local_irq_restore(flags); @@ -427,7 +676,7 @@ unsigned long native_calibrate_tsc(void)  	 * the delta to the previous read. We keep track of the min  	 * and max values of that delta. The delta is mostly defined  	 * by the IO time of the PIT access, so we can detect when a -	 * SMI/SMM disturbance happend between the two reads. If the +	 * SMI/SMM disturbance happened between the two reads. If the  	 * maximum time is significantly larger than the minimum time,  	 * then we discard the result and have another try.  	 * @@ -464,7 +713,7 @@ unsigned long native_calibrate_tsc(void)  		tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);  		/* hpet or pmtimer available ? */ -		if (!hpet && !ref1 && !ref2) +		if (ref1 == ref2)  			continue;  		/* Check, whether the sampling was disturbed by an SMI */ @@ -490,9 +739,8 @@ unsigned long native_calibrate_tsc(void)  		 * use the reference value, as it is more precise.  		 */  		if (delta >= 90 && delta <= 110) { -			printk(KERN_INFO -			       "TSC: PIT calibration matches %s. %d loops\n", -			       hpet ? "HPET" : "PMTIMER", i + 1); +			pr_info("PIT calibration matches %s. %d loops\n", +				hpet ? 
"HPET" : "PMTIMER", i + 1);  			return tsc_ref_min;  		} @@ -514,38 +762,36 @@ unsigned long native_calibrate_tsc(void)  	 */  	if (tsc_pit_min == ULONG_MAX) {  		/* PIT gave no useful value */ -		printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); +		pr_warn("Unable to calibrate against PIT\n");  		/* We don't have an alternative source, disable TSC */  		if (!hpet && !ref1 && !ref2) { -			printk("TSC: No reference (HPET/PMTIMER) available\n"); +			pr_notice("No reference (HPET/PMTIMER) available\n");  			return 0;  		}  		/* The alternative source failed as well, disable TSC */  		if (tsc_ref_min == ULONG_MAX) { -			printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " -			       "failed.\n"); +			pr_warn("HPET/PMTIMER calibration failed\n");  			return 0;  		}  		/* Use the alternative source */ -		printk(KERN_INFO "TSC: using %s reference calibration\n", -		       hpet ? "HPET" : "PMTIMER"); +		pr_info("using %s reference calibration\n", +			hpet ? "HPET" : "PMTIMER");  		return tsc_ref_min;  	}  	/* We don't have an alternative source, use the PIT calibration value */  	if (!hpet && !ref1 && !ref2) { -		printk(KERN_INFO "TSC: Using PIT calibration value\n"); +		pr_info("Using PIT calibration value\n");  		return tsc_pit_min;  	}  	/* The alternative source failed, use the PIT calibration value */  	if (tsc_ref_min == ULONG_MAX) { -		printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " -		       "Using PIT calibration\n"); +		pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n");  		return tsc_pit_min;  	} @@ -554,9 +800,9 @@ unsigned long native_calibrate_tsc(void)  	 * the PIT value as we know that there are PMTIMERs around  	 * running at double speed. At least we let the user know:  	 */ -	printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", -	       hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); -	printk(KERN_INFO "TSC: Using PIT calibration value\n"); +	pr_warn("PIT calibration deviates from %s: %lu %lu\n", +		hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); +	pr_info("Using PIT calibration value\n");  	return tsc_pit_min;  } @@ -582,59 +828,11 @@ int recalibrate_cpu_khz(void)  EXPORT_SYMBOL(recalibrate_cpu_khz); -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - *  basic equation: - *              ns = cycles / (freq / ns_per_sec) - *              ns = cycles * (ns_per_sec / freq) - *              ns = cycles * (10^9 / (cpu_khz * 10^3)) - *              ns = cycles * (10^6 / cpu_khz) - * - *      Then we use scaling math (suggested by george@mvista.com) to get: - *              ns = cycles * (10^6 * SC / cpu_khz) / SC - *              ns = cycles * cyc2ns_scale / SC - * - *      And since SC is a constant power of two, we can convert the div - *  into a shift. - * - *  We can use khz divisor instead of mhz to keep a better precision, since - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - *  (mathieu.desnoyers@polymtl.ca) - * - *                      -johnstul@us.ibm.com "math is hard, lets go shopping!" 
- */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); -DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ -	unsigned long long tsc_now, ns_now, *offset; -	unsigned long flags, *scale; - -	local_irq_save(flags); -	sched_clock_idle_sleep_event(); - -	scale = &per_cpu(cyc2ns, cpu); -	offset = &per_cpu(cyc2ns_offset, cpu); - -	rdtscll(tsc_now); -	ns_now = __cycles_2_ns(tsc_now); - -	if (cpu_khz) { -		*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; -		*offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); -	} - -	sched_clock_idle_wakeup_event(0); -	local_irq_restore(flags); -} -  static unsigned long long cyc2ns_suspend; -void save_sched_clock_state(void) +void tsc_save_sched_clock_state(void)  { -	if (!sched_clock_stable) +	if (!sched_clock_stable())  		return;  	cyc2ns_suspend = sched_clock(); @@ -648,22 +846,32 @@ void save_sched_clock_state(void)   * that sched_clock() continues from the point where it was left off during   * suspend.   */ -void restore_sched_clock_state(void) +void tsc_restore_sched_clock_state(void)  {  	unsigned long long offset;  	unsigned long flags;  	int cpu; -	if (!sched_clock_stable) +	if (!sched_clock_stable())  		return;  	local_irq_save(flags); -	__get_cpu_var(cyc2ns_offset) = 0; +	/* +	 * We're comming out of suspend, there's no concurrency yet; don't +	 * bother being nice about the RCU stuff, just write to both +	 * data fields. +	 */ + +	this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); +	this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); +  	offset = cyc2ns_suspend - sched_clock(); -	for_each_possible_cpu(cpu) -		per_cpu(cyc2ns_offset, cpu) = offset; +	for_each_possible_cpu(cpu) { +		per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; +		per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; +	}  	local_irq_restore(flags);  } @@ -706,16 +914,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,  		tsc_khz_ref = tsc_khz;  	}  	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) || -			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || -			(val == CPUFREQ_RESUMECHANGE)) { +			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {  		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);  		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);  		if (!(freq->flags & CPUFREQ_CONST_LOOPS))  			mark_tsc_unstable("cpufreq changes"); -	} -	set_cyc2ns_scale(tsc_khz, freq->cpu); +		set_cyc2ns_scale(tsc_khz, freq->cpu); +	}  	return 0;  } @@ -763,28 +970,10 @@ static cycle_t read_tsc(struct clocksource *cs)  		ret : clocksource_tsc.cycle_last;  } -#ifdef CONFIG_X86_64 -static cycle_t __vsyscall_fn vread_tsc(void) -{ -	cycle_t ret; - -	/* -	 * Surround the RDTSC by barriers, to make sure it's not -	 * speculated to outside the seqlock critical section and -	 * does not cause time warps: -	 */ -	rdtsc_barrier(); -	ret = (cycle_t)vget_cycles(); -	rdtsc_barrier(); - -	return ret >= __vsyscall_gtod_data.clock.cycle_last ? 
-		ret : __vsyscall_gtod_data.clock.cycle_last; -} -#endif -  static void resume_tsc(struct clocksource *cs)  { -	clocksource_tsc.cycle_last = 0; +	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) +		clocksource_tsc.cycle_last = 0;  }  static struct clocksource clocksource_tsc = { @@ -795,18 +984,16 @@ static struct clocksource clocksource_tsc = {  	.mask                   = CLOCKSOURCE_MASK(64),  	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |  				  CLOCK_SOURCE_MUST_VERIFY, -#ifdef CONFIG_X86_64 -	.vread                  = vread_tsc, -#endif +	.archdata               = { .vclock_mode = VCLOCK_TSC },  };  void mark_tsc_unstable(char *reason)  {  	if (!tsc_unstable) {  		tsc_unstable = 1; -		sched_clock_stable = 0; +		clear_sched_clock_stable();  		disable_sched_clock_irqtime(); -		printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); +		pr_info("Marking TSC unstable due to %s\n", reason);  		/* Change only the rating, when not registered */  		if (clocksource_tsc.mult)  			clocksource_mark_unstable(&clocksource_tsc); @@ -819,27 +1006,6 @@ void mark_tsc_unstable(char *reason)  EXPORT_SYMBOL_GPL(mark_tsc_unstable); -static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) -{ -	printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", -			d->ident); -	tsc_unstable = 1; -	return 0; -} - -/* List of systems that have known TSC problems */ -static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { -	{ -		.callback = dmi_mark_tsc_unstable, -		.ident = "IBM Thinkpad 380XD", -		.matches = { -			DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), -			DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), -		}, -	}, -	{} -}; -  static void __init check_system_tsc_reliable(void)  {  #ifdef CONFIG_MGEODE_LX @@ -860,7 +1026,7 @@ static void __init check_system_tsc_reliable(void)   * Make an educated guess if the TSC is trustworthy and synchronized   * over all CPUs.   */ -__cpuinit int unsynchronized_tsc(void) +int unsynchronized_tsc(void)  {  	if (!cpu_has_tsc || tsc_unstable)  		return 1; @@ -872,6 +1038,9 @@ __cpuinit int unsynchronized_tsc(void)  	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))  		return 0; + +	if (tsc_clocksource_reliable) +		return 0;  	/*  	 * Intel systems are normally all synchronized.  	 * Exceptions must mark TSC as unstable: @@ -879,14 +1048,92 @@ __cpuinit int unsynchronized_tsc(void)  	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {  		/* assume multi socket systems are not synchronized: */  		if (num_possible_cpus() > 1) -			tsc_unstable = 1; +			return 1;  	} -	return tsc_unstable; +	return 0;  } -static void __init init_tsc_clocksource(void) + +static void tsc_refine_calibration_work(struct work_struct *work); +static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); +/** + * tsc_refine_calibration_work - Further refine tsc freq calibration + * @work - ignored. + * + * This functions uses delayed work over a period of a + * second to further refine the TSC freq value. Since this is + * timer based, instead of loop based, we don't block the boot + * process while this longer calibration is done. + * + * If there are any calibration anomalies (too many SMIs, etc), + * or the refined calibration is off by 1% of the fast early + * calibration, we throw out the new calibration and use the + * early calibration. 
+ */ +static void tsc_refine_calibration_work(struct work_struct *work)  { +	static u64 tsc_start = -1, ref_start; +	static int hpet; +	u64 tsc_stop, ref_stop, delta; +	unsigned long freq; + +	/* Don't bother refining TSC on unstable systems */ +	if (check_tsc_unstable()) +		goto out; + +	/* +	 * Since the work is started early in boot, we may be +	 * delayed the first time we expire. So set the workqueue +	 * again once we know timers are working. +	 */ +	if (tsc_start == -1) { +		/* +		 * Only set hpet once, to avoid mixing hardware +		 * if the hpet becomes enabled later. +		 */ +		hpet = is_hpet_enabled(); +		schedule_delayed_work(&tsc_irqwork, HZ); +		tsc_start = tsc_read_refs(&ref_start, hpet); +		return; +	} + +	tsc_stop = tsc_read_refs(&ref_stop, hpet); + +	/* hpet or pmtimer available ? */ +	if (ref_start == ref_stop) +		goto out; + +	/* Check, whether the sampling was disturbed by an SMI */ +	if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX) +		goto out; + +	delta = tsc_stop - tsc_start; +	delta *= 1000000LL; +	if (hpet) +		freq = calc_hpet_ref(delta, ref_start, ref_stop); +	else +		freq = calc_pmtimer_ref(delta, ref_start, ref_stop); + +	/* Make sure we're within 1% */ +	if (abs(tsc_khz - freq) > tsc_khz/100) +		goto out; + +	tsc_khz = freq; +	pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", +		(unsigned long)tsc_khz / 1000, +		(unsigned long)tsc_khz % 1000); + +out: +	clocksource_register_khz(&clocksource_tsc, tsc_khz); +} + + +static int __init init_tsc_clocksource(void) +{ +	if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz) +		return 0; +  	if (tsc_clocksource_reliable)  		clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;  	/* lower the rating if we already know its unstable: */ @@ -894,8 +1141,27 @@ static void __init init_tsc_clocksource(void)  		clocksource_tsc.rating = 0;  		clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;  	} -	clocksource_register_khz(&clocksource_tsc, tsc_khz); + +	if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) +		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; + +	/* +	 * Trust the results of the earlier calibration on systems +	 * exporting a reliable TSC. +	 */ +	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { +		clocksource_register_khz(&clocksource_tsc, tsc_khz); +		return 0; +	} + +	schedule_delayed_work(&tsc_irqwork, 0); +	return 0;  } +/* + * We use device_initcall here, to ensure we run after the hpet + * is fully initialized, which may occur at fs_initcall time. + */ +device_initcall(init_tsc_clocksource);  void __init tsc_init(void)  { @@ -915,9 +1181,9 @@ void __init tsc_init(void)  		return;  	} -	printk("Detected %lu.%03lu MHz processor.\n", -			(unsigned long)cpu_khz / 1000, -			(unsigned long)cpu_khz % 1000); +	pr_info("Detected %lu.%03lu MHz processor\n", +		(unsigned long)cpu_khz / 1000, +		(unsigned long)cpu_khz % 1000);  	/*  	 * Secondary CPUs do not run through tsc_init(), so set up @@ -925,14 +1191,18 @@ void __init tsc_init(void)  	 * speed as the bootup CPU. 
(cpufreq notifiers will fix this  	 * up if their speed diverges)  	 */ -	for_each_possible_cpu(cpu) +	for_each_possible_cpu(cpu) { +		cyc2ns_init(cpu);  		set_cyc2ns_scale(cpu_khz, cpu); +	}  	if (tsc_disabled > 0)  		return;  	/* now allow native_sched_clock() to use rdtsc */ +  	tsc_disabled = 0; +	static_key_slow_inc(&__use_tsc);  	if (!no_sched_irq_time)  		enable_sched_clock_irqtime(); @@ -942,13 +1212,30 @@ void __init tsc_init(void)  	lpj_fine = lpj;  	use_tsc_delay(); -	/* Check and install the TSC clocksource */ -	dmi_check_system(bad_tsc_dmi_table);  	if (unsynchronized_tsc())  		mark_tsc_unstable("TSCs unsynchronized");  	check_system_tsc_reliable(); -	init_tsc_clocksource();  } +#ifdef CONFIG_SMP +/* + * If we have a constant TSC and are using the TSC for the delay loop, + * we can skip clock calibration if another cpu in the same socket has already + * been calibrated. This assumes that CONSTANT_TSC applies to all + * cpus in the socket - this should be a safe assumption. + */ +unsigned long calibrate_delay_is_known(void) +{ +	int i, cpu = smp_processor_id(); + +	if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) +		return 0; + +	for_each_online_cpu(i) +		if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) +			return cpu_data(i).loops_per_jiffy; +	return 0; +} +#endif  | 
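The conversion at the heart of this change is ns = offset + (cycles * cyc2ns_mul) >> 10, with cyc2ns_mul = (10^6 << 10) / cpu_khz, and the offset recomputed on every frequency change so the clock stays continuous, as set_cyc2ns_scale() does above. Below is a minimal user-space sketch of that arithmetic only, not the kernel code: the 128-bit multiply stands in for the kernel's mul_u64_u32_shr(), the struct and field names are simplified, and the 3 GHz / 2 GHz figures are invented for illustration.

#include <stdint.h>
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10                    /* same shift the patch uses */
#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))

/* Stand-in for the kernel's mul_u64_u32_shr(): (a * b) >> shift, using a
 * 128-bit intermediate (GCC/Clang extension) so the 64x32 product cannot
 * overflow. */
static uint64_t mul_u64_u32_shr(uint64_t a, uint32_t b, unsigned shift)
{
        return (uint64_t)(((unsigned __int128)a * b) >> shift);
}

struct cyc2ns_data {                              /* simplified field names */
        uint32_t mul;
        uint32_t shift;
        uint64_t offset;
};

/* ns = offset + (cycles * mul) >> shift */
static uint64_t cycles_2_ns(const struct cyc2ns_data *d, uint64_t cyc)
{
        return d->offset + mul_u64_u32_shr(cyc, d->mul, d->shift);
}

/* Recompute mul for a new cpu_khz and pick the offset so the clock is
 * continuous at tsc_now, mirroring set_cyc2ns_scale() in the patch. */
static void set_scale(struct cyc2ns_data *d, unsigned long cpu_khz, uint64_t tsc_now)
{
        uint64_t ns_now = cycles_2_ns(d, tsc_now);

        d->mul    = DIV_ROUND(1000000ULL << CYC2NS_SCALE_FACTOR, cpu_khz);
        d->shift  = CYC2NS_SCALE_FACTOR;
        d->offset = ns_now - mul_u64_u32_shr(tsc_now, d->mul, d->shift);
}

int main(void)
{
        /* invented frequencies: start at 3 GHz, drop to 2 GHz */
        struct cyc2ns_data d = { .mul = DIV_ROUND(1000000ULL << 10, 3000000), .shift = 10 };
        uint64_t tsc = 6000000000ULL;             /* ~2 s worth of cycles at 3 GHz */

        printf("before switch: %llu ns\n", (unsigned long long)cycles_2_ns(&d, tsc));

        set_scale(&d, 2000000, tsc);              /* "cpufreq" change to 2 GHz */
        printf("at switch:     %llu ns\n", (unsigned long long)cycles_2_ns(&d, tsc));
        printf("1 s later:     %llu ns\n", (unsigned long long)cycles_2_ns(&d, tsc + 2000000000ULL));
        return 0;
}

Only the arithmetic is shown here; the head/tail/__count juggling in the patch exists so a writer can swap in a new {mul, offset} pair like this while readers are in flight, without locks and without a reader ever seeing a half-written pair.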

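The calibration side reduces to similar ratio arithmetic: quick_pit_calibrate() converts the TSC cycles counted across i PIT-MSB steps via kHz = (t2 - t1) * PIT_TICK_RATE / (i * 256 * 1000), and tsc_refine_calibration_work() only adopts the slower reference-based result when it lands within 1% of that early value. A sketch of just those two computations, with invented sample numbers (only the 1193182 Hz PIT rate is real); quick_pit_khz() and refine() are illustrative helpers, not kernel functions.

#include <stdint.h>
#include <stdio.h>

#define PIT_TICK_RATE 1193182UL                   /* i8253 input clock, Hz */

/* kHz = (t2 - t1) * PIT_TICK_RATE / (iterations * 256 * 1000),
 * as in the success path of quick_pit_calibrate(). */
static unsigned long quick_pit_khz(uint64_t t1, uint64_t t2, unsigned long iterations)
{
        uint64_t delta = (t2 - t1) * PIT_TICK_RATE;

        return delta / (iterations * 256 * 1000);
}

/* Keep the refined value only if it is within 1% of the early calibration,
 * mirroring the check in tsc_refine_calibration_work(). */
static unsigned long refine(unsigned long early_khz, unsigned long refined_khz)
{
        unsigned long diff = early_khz > refined_khz ? early_khz - refined_khz
                                                     : refined_khz - early_khz;

        if (diff > early_khz / 100)
                return early_khz;                 /* too far off: keep the early value */
        return refined_khz;
}

int main(void)
{
        /* invented sample: 230 PIT-MSB steps, ~132.3e6 TSC cycles elapsed */
        unsigned long early = quick_pit_khz(1000000ULL, 133312000ULL, 230);

        printf("early calibration:  %lu kHz\n", early);
        printf("refined (accepted): %lu kHz\n", refine(early, early + early / 500));
        printf("refined (rejected): %lu kHz\n", refine(early, early + early / 20));
        return 0;
}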