Diffstat (limited to 'kernel/time')
-rw-r--r--   kernel/time/Kconfig          |  80
-rw-r--r--   kernel/time/ntp.c            | 105
-rw-r--r--   kernel/time/ntp_internal.h   |  12
-rw-r--r--   kernel/time/tick-broadcast.c | 242
-rw-r--r--   kernel/time/tick-common.c    |   7
-rw-r--r--   kernel/time/tick-internal.h  |   5
-rw-r--r--   kernel/time/tick-sched.c     | 300
-rw-r--r--   kernel/time/timekeeping.c    | 396
-rw-r--r--   kernel/time/timer_list.c     | 104
9 files changed, 994 insertions, 257 deletions
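Among other things, the ntp.c and timekeeping.c hunks below move the TAI offset out of the ntp_lock-protected state (time_tai) into the timekeeper (tk->tai_offset, tk->offs_tai) and route it through __do_adjtimex(). Userspace continues to read that offset through adjtimex(); the following is a minimal read-only sketch of that interface (an illustration only, not part of the patch):

    /*
     * Illustration: read the kernel's TAI-UTC offset via adjtimex().
     * With modes = 0 the call is read-only and needs no privileges;
     * tx.tai is filled from the kernel's time_tai / tk->tai_offset.
     */
    #include <stdio.h>
    #include <string.h>
    #include <sys/timex.h>

    int main(void)
    {
            struct timex tx;

            memset(&tx, 0, sizeof(tx));     /* modes = 0: query only */
            if (adjtimex(&tx) == -1) {
                    perror("adjtimex");
                    return 1;
            }
            printf("TAI-UTC offset: %d seconds\n", tx.tai);
            return 0;
    }

Setting the offset goes through ADJ_TAI with a positive constant and, as enforced by the new ntp_validate_timex() in the patch, requires CAP_SYS_TIME.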
| diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d84efd..e4c07b0692b 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -64,20 +64,88 @@ config GENERIC_CMOS_UPDATE  if GENERIC_CLOCKEVENTS  menu "Timers subsystem" -# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is  # only related to the tick functionality. Oneshot clockevent devices  # are supported independ of this.  config TICK_ONESHOT  	bool -config NO_HZ -	bool "Tickless System (Dynamic Ticks)" +config NO_HZ_COMMON +	bool  	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS  	select TICK_ONESHOT + +choice +	prompt "Timer tick handling" +	default NO_HZ_IDLE if NO_HZ + +config HZ_PERIODIC +	bool "Periodic timer ticks (constant rate, no dynticks)" +	help +	  This option keeps the tick running periodically at a constant +	  rate, even when the CPU doesn't need it. + +config NO_HZ_IDLE +	bool "Idle dynticks system (tickless idle)" +	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS +	select NO_HZ_COMMON +	help +	  This option enables a tickless idle system: timer interrupts +	  will only trigger on an as-needed basis when the system is idle. +	  This is usually interesting for energy saving. + +	  Most of the time you want to say Y here. + +config NO_HZ_FULL +	bool "Full dynticks system (tickless)" +	# NO_HZ_COMMON dependency +	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS +	# We need at least one periodic CPU for timekeeping +	depends on SMP +	# RCU_USER_QS dependency +	depends on HAVE_CONTEXT_TRACKING +	# VIRT_CPU_ACCOUNTING_GEN dependency +	depends on 64BIT +	select NO_HZ_COMMON +	select RCU_USER_QS +	select RCU_NOCB_CPU +	select VIRT_CPU_ACCOUNTING_GEN +	select CONTEXT_TRACKING_FORCE +	select IRQ_WORK +	help +	 Adaptively try to shutdown the tick whenever possible, even when +	 the CPU is running tasks. Typically this requires running a single +	 task on the CPU. Chances for running tickless are maximized when +	 the task mostly runs in userspace and has few kernel activity. + +	 You need to fill up the nohz_full boot parameter with the +	 desired range of dynticks CPUs. + +	 This is implemented at the expense of some overhead in user <-> kernel +	 transitions: syscalls, exceptions and interrupts. Even when it's +	 dynamically off. + +	 Say N. + +endchoice + +config NO_HZ_FULL_ALL +       bool "Full dynticks system on all CPUs by default" +       depends on NO_HZ_FULL +       help +         If the user doesn't pass the nohz_full boot option to +	 define the range of full dynticks CPUs, consider that all +	 CPUs in the system are full dynticks by default. +	 Note the boot CPU will still be kept outside the range to +	 handle the timekeeping duty. + +config NO_HZ +	bool "Old Idle dynticks config" +	depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS  	help -	  This option enables a tickless system: timer interrupts will -	  only trigger on an as-needed basis both when the system is -	  busy and when the system is idle. +	  This is the old config entry that enables dynticks idle. +	  We keep it around for a little while to enforce backward +	  compatibility with older config files.  
config HIGH_RES_TIMERS  	bool "High Resolution Timer Support" diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 072bb066bb7..12ff13a838c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,13 +18,14 @@  #include <linux/rtc.h>  #include "tick-internal.h" +#include "ntp_internal.h"  /*   * NTP timekeeping variables: + * + * Note: All of the NTP state is protected by the timekeeping locks.   */ -DEFINE_RAW_SPINLOCK(ntp_lock); -  /* USER_HZ period (usecs): */  unsigned long			tick_usec = TICK_USEC; @@ -53,9 +54,6 @@ static int			time_state = TIME_OK;  /* clock status bits:							*/  static int			time_status = STA_UNSYNC; -/* TAI offset (secs):							*/ -static long			time_tai; -  /* time adjustment (nsecs):						*/  static s64			time_offset; @@ -134,8 +132,6 @@ static inline void pps_reset_freq_interval(void)  /**   * pps_clear - Clears the PPS state variables - * - * Must be called while holding a write on the ntp_lock   */  static inline void pps_clear(void)  { @@ -150,8 +146,6 @@ static inline void pps_clear(void)  /* Decrease pps_valid to indicate that another second has passed since   * the last PPS signal. When it reaches 0, indicate that PPS signal is   * missing. - * - * Must be called while holding a write on the ntp_lock   */  static inline void pps_dec_valid(void)  { @@ -346,10 +340,6 @@ static void ntp_update_offset(long offset)   */  void ntp_clear(void)  { -	unsigned long flags; - -	raw_spin_lock_irqsave(&ntp_lock, flags); -  	time_adjust	= 0;		/* stop active adjtime() */  	time_status	|= STA_UNSYNC;  	time_maxerror	= NTP_PHASE_LIMIT; @@ -362,20 +352,12 @@ void ntp_clear(void)  	/* Clear PPS state variables */  	pps_clear(); -	raw_spin_unlock_irqrestore(&ntp_lock, flags); -  }  u64 ntp_tick_length(void)  { -	unsigned long flags; -	s64 ret; - -	raw_spin_lock_irqsave(&ntp_lock, flags); -	ret = tick_length; -	raw_spin_unlock_irqrestore(&ntp_lock, flags); -	return ret; +	return tick_length;  } @@ -393,9 +375,6 @@ int second_overflow(unsigned long secs)  {  	s64 delta;  	int leap = 0; -	unsigned long flags; - -	raw_spin_lock_irqsave(&ntp_lock, flags);  	/*  	 * Leap second processing. 
If in leap-insert state at the end of the @@ -415,7 +394,6 @@ int second_overflow(unsigned long secs)  		else if (secs % 86400 == 0) {  			leap = -1;  			time_state = TIME_OOP; -			time_tai++;  			printk(KERN_NOTICE  				"Clock: inserting leap second 23:59:60 UTC\n");  		} @@ -425,7 +403,6 @@ int second_overflow(unsigned long secs)  			time_state = TIME_OK;  		else if ((secs + 1) % 86400 == 0) {  			leap = 1; -			time_tai--;  			time_state = TIME_WAIT;  			printk(KERN_NOTICE  				"Clock: deleting leap second 23:59:59 UTC\n"); @@ -479,8 +456,6 @@ int second_overflow(unsigned long secs)  	time_adjust = 0;  out: -	raw_spin_unlock_irqrestore(&ntp_lock, flags); -  	return leap;  } @@ -575,11 +550,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)  	time_status |= txc->status & ~STA_RONLY;  } -/* - * Called with ntp_lock held, so we can access and modify - * all the global NTP state: - */ -static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) + +static inline void process_adjtimex_modes(struct timex *txc, +						struct timespec *ts, +						s32 *time_tai)  {  	if (txc->modes & ADJ_STATUS)  		process_adj_status(txc, ts); @@ -613,7 +587,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts  	}  	if (txc->modes & ADJ_TAI && txc->constant > 0) -		time_tai = txc->constant; +		*time_tai = txc->constant;  	if (txc->modes & ADJ_OFFSET)  		ntp_update_offset(txc->offset); @@ -625,16 +599,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts  		ntp_update_frequency();  } -/* - * adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. + + +/** + * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex   */ -int do_adjtimex(struct timex *txc) +int ntp_validate_timex(struct timex *txc)  { -	struct timespec ts; -	int result; - -	/* Validate the data before disabling interrupts */  	if (txc->modes & ADJ_ADJTIME) {  		/* singleshot must not be used with any other mode bits */  		if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) @@ -646,7 +617,6 @@ int do_adjtimex(struct timex *txc)  		/* In order to modify anything, you gotta be super-user! */  		 if (txc->modes && !capable(CAP_SYS_TIME))  			return -EPERM; -  		/*  		 * if the quartz is off by more than 10% then  		 * something is VERY wrong! @@ -657,22 +627,20 @@ int do_adjtimex(struct timex *txc)  			return -EINVAL;  	} -	if (txc->modes & ADJ_SETOFFSET) { -		struct timespec delta; -		delta.tv_sec  = txc->time.tv_sec; -		delta.tv_nsec = txc->time.tv_usec; -		if (!capable(CAP_SYS_TIME)) -			return -EPERM; -		if (!(txc->modes & ADJ_NANO)) -			delta.tv_nsec *= 1000; -		result = timekeeping_inject_offset(&delta); -		if (result) -			return result; -	} +	if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) +		return -EPERM; -	getnstimeofday(&ts); +	return 0; +} -	raw_spin_lock_irq(&ntp_lock); + +/* + * adjtimex mainly allows reading (and writing, if superuser) of + * kernel time-keeping variables. used by xntpd. 
+ */ +int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) +{ +	int result;  	if (txc->modes & ADJ_ADJTIME) {  		long save_adjust = time_adjust; @@ -687,7 +655,7 @@ int do_adjtimex(struct timex *txc)  		/* If there are input parameters, then process them: */  		if (txc->modes) -			process_adjtimex_modes(txc, &ts); +			process_adjtimex_modes(txc, ts, time_tai);  		txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,  				  NTP_SCALE_SHIFT); @@ -709,15 +677,13 @@ int do_adjtimex(struct timex *txc)  	txc->precision	   = 1;  	txc->tolerance	   = MAXFREQ_SCALED / PPM_SCALE;  	txc->tick	   = tick_usec; -	txc->tai	   = time_tai; +	txc->tai	   = *time_tai;  	/* fill PPS status fields */  	pps_fill_timex(txc); -	raw_spin_unlock_irq(&ntp_lock); - -	txc->time.tv_sec = ts.tv_sec; -	txc->time.tv_usec = ts.tv_nsec; +	txc->time.tv_sec = ts->tv_sec; +	txc->time.tv_usec = ts->tv_nsec;  	if (!(time_status & STA_NANO))  		txc->time.tv_usec /= NSEC_PER_USEC; @@ -894,7 +860,7 @@ static void hardpps_update_phase(long error)  }  /* - * hardpps() - discipline CPU clock oscillator to external PPS signal + * __hardpps() - discipline CPU clock oscillator to external PPS signal   *   * This routine is called at each PPS signal arrival in order to   * discipline the CPU clock oscillator to the PPS signal. It takes two @@ -905,15 +871,13 @@ static void hardpps_update_phase(long error)   * This code is based on David Mills's reference nanokernel   * implementation. It was mostly rewritten but keeps the same idea.   */ -void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  {  	struct pps_normtime pts_norm, freq_norm;  	unsigned long flags;  	pts_norm = pps_normalize_ts(*phase_ts); -	raw_spin_lock_irqsave(&ntp_lock, flags); -  	/* clear the error bits, they will be set again if needed */  	time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -925,7 +889,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  	 * just start the frequency interval */  	if (unlikely(pps_fbase.tv_sec == 0)) {  		pps_fbase = *raw_ts; -		raw_spin_unlock_irqrestore(&ntp_lock, flags);  		return;  	} @@ -940,7 +903,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  		time_status |= STA_PPSJITTER;  		/* restart the frequency calibration interval */  		pps_fbase = *raw_ts; -		raw_spin_unlock_irqrestore(&ntp_lock, flags);  		pr_err("hardpps: PPSJITTER: bad pulse\n");  		return;  	} @@ -957,10 +919,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  	hardpps_update_phase(pts_norm.nsec); -	raw_spin_unlock_irqrestore(&ntp_lock, flags);  } -EXPORT_SYMBOL(hardpps); -  #endif	/* CONFIG_NTP_PPS */  static int __init ntp_tick_adj_setup(char *str) diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h new file mode 100644 index 00000000000..1950cb4ca2a --- /dev/null +++ b/kernel/time/ntp_internal.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_NTP_INTERNAL_H +#define _LINUX_NTP_INTERNAL_H + +extern void ntp_init(void); +extern void ntp_clear(void); +/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. 
*/ +extern u64 ntp_tick_length(void); +extern int second_overflow(unsigned long secs); +extern int ntp_validate_timex(struct timex *); +extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); +extern void __hardpps(const struct timespec *, const struct timespec *); +#endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 7f32fe0e52c..206bbfb34e0 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -28,9 +28,8 @@   */  static struct tick_device tick_broadcast_device; -/* FIXME: Use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); -static DECLARE_BITMAP(tmpmask, NR_CPUS); +static cpumask_var_t tick_broadcast_mask; +static cpumask_var_t tmpmask;  static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);  static int tick_broadcast_force; @@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void)  struct cpumask *tick_get_broadcast_mask(void)  { -	return to_cpumask(tick_broadcast_mask); +	return tick_broadcast_mask;  }  /* @@ -67,6 +66,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)   */  int tick_check_broadcast_device(struct clock_event_device *dev)  { +	struct clock_event_device *cur = tick_broadcast_device.evtdev; +  	if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||  	    (tick_broadcast_device.evtdev &&  	     tick_broadcast_device.evtdev->rating >= dev->rating) || @@ -74,9 +75,21 @@ int tick_check_broadcast_device(struct clock_event_device *dev)  		return 0;  	clockevents_exchange_device(tick_broadcast_device.evtdev, dev); +	if (cur) +		cur->event_handler = clockevents_handle_noop;  	tick_broadcast_device.evtdev = dev; -	if (!cpumask_empty(tick_get_broadcast_mask())) +	if (!cpumask_empty(tick_broadcast_mask))  		tick_broadcast_start_periodic(dev); +	/* +	 * Inform all cpus about this. We might be in a situation +	 * where we did not switch to oneshot mode because the per cpu +	 * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack +	 * of a oneshot capable broadcast device. Without that +	 * notification the systems stays stuck in periodic mode +	 * forever. 
+	 */ +	if (dev->features & CLOCK_EVT_FEAT_ONESHOT) +		tick_clock_notify();  	return 1;  } @@ -124,7 +137,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)  	if (!tick_device_is_functional(dev)) {  		dev->event_handler = tick_handle_periodic;  		tick_device_setup_broadcast_func(dev); -		cpumask_set_cpu(cpu, tick_get_broadcast_mask()); +		cpumask_set_cpu(cpu, tick_broadcast_mask);  		tick_broadcast_start_periodic(tick_broadcast_device.evtdev);  		ret = 1;  	} else { @@ -135,7 +148,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)  		 */  		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {  			int cpu = smp_processor_id(); -			cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); +			cpumask_clear_cpu(cpu, tick_broadcast_mask);  			tick_broadcast_clear_oneshot(cpu);  		} else {  			tick_device_setup_broadcast_func(dev); @@ -199,9 +212,8 @@ static void tick_do_periodic_broadcast(void)  {  	raw_spin_lock(&tick_broadcast_lock); -	cpumask_and(to_cpumask(tmpmask), -		    cpu_online_mask, tick_get_broadcast_mask()); -	tick_do_broadcast(to_cpumask(tmpmask)); +	cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); +	tick_do_broadcast(tmpmask);  	raw_spin_unlock(&tick_broadcast_lock);  } @@ -264,13 +276,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)  	if (!tick_device_is_functional(dev))  		goto out; -	bc_stopped = cpumask_empty(tick_get_broadcast_mask()); +	bc_stopped = cpumask_empty(tick_broadcast_mask);  	switch (*reason) {  	case CLOCK_EVT_NOTIFY_BROADCAST_ON:  	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: -		if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { -			cpumask_set_cpu(cpu, tick_get_broadcast_mask()); +		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {  			if (tick_broadcast_device.mode ==  			    TICKDEV_MODE_PERIODIC)  				clockevents_shutdown(dev); @@ -280,8 +291,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)  		break;  	case CLOCK_EVT_NOTIFY_BROADCAST_OFF:  		if (!tick_broadcast_force && -		    cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { -			cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); +		    cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {  			if (tick_broadcast_device.mode ==  			    TICKDEV_MODE_PERIODIC)  				tick_setup_periodic(dev, 0); @@ -289,7 +299,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)  		break;  	} -	if (cpumask_empty(tick_get_broadcast_mask())) { +	if (cpumask_empty(tick_broadcast_mask)) {  		if (!bc_stopped)  			clockevents_shutdown(bc);  	} else if (bc_stopped) { @@ -338,10 +348,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)  	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);  	bc = tick_broadcast_device.evtdev; -	cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); +	cpumask_clear_cpu(cpu, tick_broadcast_mask);  	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { -		if (bc && cpumask_empty(tick_get_broadcast_mask())) +		if (bc && cpumask_empty(tick_broadcast_mask))  			clockevents_shutdown(bc);  	} @@ -377,13 +387,13 @@ int tick_resume_broadcast(void)  		switch (tick_broadcast_device.mode) {  		case TICKDEV_MODE_PERIODIC: -			if (!cpumask_empty(tick_get_broadcast_mask())) +			if (!cpumask_empty(tick_broadcast_mask))  				tick_broadcast_start_periodic(bc);  			broadcast = cpumask_test_cpu(smp_processor_id(), -						     tick_get_broadcast_mask()); +						     tick_broadcast_mask);  			break;  		case TICKDEV_MODE_ONESHOT: -			if (!cpumask_empty(tick_get_broadcast_mask())) +			if 
(!cpumask_empty(tick_broadcast_mask))  				broadcast = tick_resume_broadcast_oneshot(bc);  			break;  		} @@ -396,25 +406,58 @@ int tick_resume_broadcast(void)  #ifdef CONFIG_TICK_ONESHOT -/* FIXME: use cpumask_var_t. */ -static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); +static cpumask_var_t tick_broadcast_oneshot_mask; +static cpumask_var_t tick_broadcast_pending_mask; +static cpumask_var_t tick_broadcast_force_mask;  /*   * Exposed for debugging: see timer_list.c   */  struct cpumask *tick_get_broadcast_oneshot_mask(void)  { -	return to_cpumask(tick_broadcast_oneshot_mask); +	return tick_broadcast_oneshot_mask;  } -static int tick_broadcast_set_event(ktime_t expires, int force) +/* + * Called before going idle with interrupts disabled. Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control(). The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +int tick_check_broadcast_expired(void)  { -	struct clock_event_device *bc = tick_broadcast_device.evtdev; +	return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +} + +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, +					const struct cpumask *cpumask) +{ +	if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) +		return; + +	if (cpumask_equal(bc->cpumask, cpumask)) +		return; + +	bc->cpumask = cpumask; +	irq_set_affinity(bc->irq, bc->cpumask); +} + +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, +				    ktime_t expires, int force) +{ +	int ret;  	if (bc->mode != CLOCK_EVT_MODE_ONESHOT)  		clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); -	return clockevents_program_event(bc, expires, force); +	ret = clockevents_program_event(bc, expires, force); +	if (!ret) +		tick_broadcast_set_affinity(bc, cpumask_of(cpu)); +	return ret;  }  int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -429,7 +472,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)   */  void tick_check_oneshot_broadcast(int cpu)  { -	if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { +	if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {  		struct tick_device *td = &per_cpu(tick_cpu_device, cpu);  		clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); @@ -443,27 +486,39 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)  {  	struct tick_device *td;  	ktime_t now, next_event; -	int cpu; +	int cpu, next_cpu = 0;  	raw_spin_lock(&tick_broadcast_lock);  again:  	dev->next_event.tv64 = KTIME_MAX;  	next_event.tv64 = KTIME_MAX; -	cpumask_clear(to_cpumask(tmpmask)); +	cpumask_clear(tmpmask);  	now = ktime_get();  	/* Find all expired events */ -	for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { +	for_each_cpu(cpu, tick_broadcast_oneshot_mask) {  		td = &per_cpu(tick_cpu_device, cpu); -		if (td->evtdev->next_event.tv64 <= now.tv64) -			cpumask_set_cpu(cpu, to_cpumask(tmpmask)); -		else if (td->evtdev->next_event.tv64 < next_event.tv64) +		if (td->evtdev->next_event.tv64 <= now.tv64) { +			cpumask_set_cpu(cpu, tmpmask); +			/* +			 * Mark the remote cpu in the pending mask, so +			 * it can avoid reprogramming the cpu local +			 * timer in tick_broadcast_oneshot_control(). 
+			 */ +			cpumask_set_cpu(cpu, tick_broadcast_pending_mask); +		} else if (td->evtdev->next_event.tv64 < next_event.tv64) {  			next_event.tv64 = td->evtdev->next_event.tv64; +			next_cpu = cpu; +		}  	} +	/* Take care of enforced broadcast requests */ +	cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); +	cpumask_clear(tick_broadcast_force_mask); +  	/*  	 * Wakeup the cpus which have an expired event.  	 */ -	tick_do_broadcast(to_cpumask(tmpmask)); +	tick_do_broadcast(tmpmask);  	/*  	 * Two reasons for reprogram: @@ -480,7 +535,7 @@ again:  		 * Rearm the broadcast device. If event expired,  		 * repeat the above  		 */ -		if (tick_broadcast_set_event(next_event, 0)) +		if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))  			goto again;  	}  	raw_spin_unlock(&tick_broadcast_lock); @@ -495,6 +550,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)  	struct clock_event_device *bc, *dev;  	struct tick_device *td;  	unsigned long flags; +	ktime_t now;  	int cpu;  	/* @@ -519,21 +575,84 @@ void tick_broadcast_oneshot_control(unsigned long reason)  	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);  	if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { -		if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { -			cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); +		WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); +		if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {  			clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); -			if (dev->next_event.tv64 < bc->next_event.tv64) -				tick_broadcast_set_event(dev->next_event, 1); +			/* +			 * We only reprogram the broadcast timer if we +			 * did not mark ourself in the force mask and +			 * if the cpu local event is earlier than the +			 * broadcast event. If the current CPU is in +			 * the force mask, then we are going to be +			 * woken by the IPI right away. +			 */ +			if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && +			    dev->next_event.tv64 < bc->next_event.tv64) +				tick_broadcast_set_event(bc, cpu, dev->next_event, 1);  		}  	} else { -		if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { -			cpumask_clear_cpu(cpu, -					  tick_get_broadcast_oneshot_mask()); +		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {  			clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); -			if (dev->next_event.tv64 != KTIME_MAX) -				tick_program_event(dev->next_event, 1); +			if (dev->next_event.tv64 == KTIME_MAX) +				goto out; +			/* +			 * The cpu which was handling the broadcast +			 * timer marked this cpu in the broadcast +			 * pending mask and fired the broadcast +			 * IPI. So we are going to handle the expired +			 * event anyway via the broadcast IPI +			 * handler. No need to reprogram the timer +			 * with an already expired event. +			 */ +			if (cpumask_test_and_clear_cpu(cpu, +				       tick_broadcast_pending_mask)) +				goto out; + +			/* +			 * If the pending bit is not set, then we are +			 * either the CPU handling the broadcast +			 * interrupt or we got woken by something else. +			 * +			 * We are not longer in the broadcast mask, so +			 * if the cpu local expiry time is already +			 * reached, we would reprogram the cpu local +			 * timer with an already expired event. +			 * +			 * This can lead to a ping-pong when we return +			 * to idle and therefor rearm the broadcast +			 * timer before the cpu local timer was able +			 * to fire. 
This happens because the forced +			 * reprogramming makes sure that the event +			 * will happen in the future and depending on +			 * the min_delta setting this might be far +			 * enough out that the ping-pong starts. +			 * +			 * If the cpu local next_event has expired +			 * then we know that the broadcast timer +			 * next_event has expired as well and +			 * broadcast is about to be handled. So we +			 * avoid reprogramming and enforce that the +			 * broadcast handler, which did not run yet, +			 * will invoke the cpu local handler. +			 * +			 * We cannot call the handler directly from +			 * here, because we might be in a NOHZ phase +			 * and we did not go through the irq_enter() +			 * nohz fixups. +			 */ +			now = ktime_get(); +			if (dev->next_event.tv64 <= now.tv64) { +				cpumask_set_cpu(cpu, tick_broadcast_force_mask); +				goto out; +			} +			/* +			 * We got woken by something else. Reprogram +			 * the cpu local timer device. +			 */ +			tick_program_event(dev->next_event, 1);  		}  	} +out:  	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);  } @@ -544,7 +663,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)   */  static void tick_broadcast_clear_oneshot(int cpu)  { -	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); +	cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);  }  static void tick_broadcast_init_next_event(struct cpumask *mask, @@ -574,7 +693,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)  		bc->event_handler = tick_handle_oneshot_broadcast;  		/* Take the do_timer update */ -		tick_do_timer_cpu = cpu; +		if (!tick_nohz_full_cpu(cpu)) +			tick_do_timer_cpu = cpu;  		/*  		 * We must be careful here. There might be other CPUs @@ -582,17 +702,16 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)  		 * oneshot_mask bits for those and program the  		 * broadcast device to fire.  		 */ -		cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); -		cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); -		cpumask_or(tick_get_broadcast_oneshot_mask(), -			   tick_get_broadcast_oneshot_mask(), -			   to_cpumask(tmpmask)); +		cpumask_copy(tmpmask, tick_broadcast_mask); +		cpumask_clear_cpu(cpu, tmpmask); +		cpumask_or(tick_broadcast_oneshot_mask, +			   tick_broadcast_oneshot_mask, tmpmask); -		if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { +		if (was_periodic && !cpumask_empty(tmpmask)) {  			clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); -			tick_broadcast_init_next_event(to_cpumask(tmpmask), +			tick_broadcast_init_next_event(tmpmask,  						       tick_next_period); -			tick_broadcast_set_event(tick_next_period, 1); +			tick_broadcast_set_event(bc, cpu, tick_next_period, 1);  		} else  			bc->next_event.tv64 = KTIME_MAX;  	} else { @@ -640,7 +759,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)  	 * Clear the broadcast mask flag for the dead cpu, but do not  	 * stop the broadcast device!  	 
*/ -	cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); +	cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);  	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);  } @@ -664,3 +783,14 @@ bool tick_broadcast_oneshot_available(void)  }  #endif + +void __init tick_broadcast_init(void) +{ +	alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); +	alloc_cpumask_var(&tmpmask, GFP_NOWAIT); +#ifdef CONFIG_TICK_ONESHOT +	alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); +	alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT); +	alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT); +#endif +} diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b1600a6973f..5d3fb100bc0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td,  		 * this cpu:  		 */  		if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { -			tick_do_timer_cpu = cpu; +			if (!tick_nohz_full_cpu(cpu)) +				tick_do_timer_cpu = cpu; +			else +				tick_do_timer_cpu = TICK_DO_TIMER_NONE;  			tick_next_period = ktime_get();  			tick_period = ktime_set(0, NSEC_PER_SEC / HZ);  		} @@ -323,6 +326,7 @@ static void tick_shutdown(unsigned int *cpup)  		 */  		dev->mode = CLOCK_EVT_MODE_UNUSED;  		clockevents_exchange_device(dev, NULL); +		dev->event_handler = clockevents_handle_noop;  		td->evtdev = NULL;  	}  	raw_spin_unlock_irqrestore(&tick_device_lock, flags); @@ -416,4 +420,5 @@ static struct notifier_block tick_notifier = {  void __init tick_init(void)  {  	clockevents_register_notifier(&tick_notifier); +	tick_broadcast_init();  } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59ed6dc..f0299eae460 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@  #include <linux/hrtimer.h>  #include <linux/tick.h> +extern seqlock_t jiffies_lock; +  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD  #define TICK_DO_TIMER_NONE	-1 @@ -94,7 +96,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);  extern void tick_shutdown_broadcast(unsigned int *cpup);  extern void tick_suspend_broadcast(void);  extern int tick_resume_broadcast(void); - +extern void tick_broadcast_init(void);  extern void  tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); @@ -119,6 +121,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }  static inline void tick_shutdown_broadcast(unsigned int *cpup) { }  static inline void tick_suspend_broadcast(void) { }  static inline int tick_resume_broadcast(void) { return 0; } +static inline void tick_broadcast_init(void) { }  /*   * Set the periodic handler in non broadcast mode diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a19a39952c1..bc67d4245e1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -21,11 +21,15 @@  #include <linux/sched.h>  #include <linux/module.h>  #include <linux/irq_work.h> +#include <linux/posix-timers.h> +#include <linux/perf_event.h>  #include <asm/irq_regs.h>  #include "tick-internal.h" +#include <trace/events/timer.h> +  /*   * Per cpu nohz control structure   */ @@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now)  {  	int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  	/*  	 * Check if the do_timer duty was dropped. 
We don't care about  	 * concurrency: This happens only when the cpu in charge went @@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now)  	 * this duty, then the jiffies update is still serialized by  	 * jiffies_lock.  	 */ -	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) +	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) +	    && !tick_nohz_full_cpu(cpu))  		tick_do_timer_cpu = cpu;  #endif @@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now)  static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)  { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  	/*  	 * When we are idle and the tick is stopped, we have to touch  	 * the watchdog as we might not schedule for a really long @@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)  	profile_tick(CPU_PROFILING);  } +#ifdef CONFIG_NO_HZ_FULL +static cpumask_var_t nohz_full_mask; +bool have_nohz_full_mask; + +static bool can_stop_full_tick(void) +{ +	WARN_ON_ONCE(!irqs_disabled()); + +	if (!sched_can_stop_tick()) { +		trace_tick_stop(0, "more than 1 task in runqueue\n"); +		return false; +	} + +	if (!posix_cpu_timers_can_stop_tick(current)) { +		trace_tick_stop(0, "posix timers running\n"); +		return false; +	} + +	if (!perf_event_can_stop_tick()) { +		trace_tick_stop(0, "perf events running\n"); +		return false; +	} + +	/* sched_clock_tick() needs us? */ +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +	/* +	 * TODO: kick full dynticks CPUs when +	 * sched_clock_stable is set. +	 */ +	if (!sched_clock_stable) { +		trace_tick_stop(0, "unstable sched clock\n"); +		return false; +	} +#endif + +	return true; +} + +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); + +/* + * Re-evaluate the need for the tick on the current CPU + * and restart it if necessary. + */ +void tick_nohz_full_check(void) +{ +	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + +	if (tick_nohz_full_cpu(smp_processor_id())) { +		if (ts->tick_stopped && !is_idle_task(current)) { +			if (!can_stop_full_tick()) +				tick_nohz_restart_sched_tick(ts, ktime_get()); +		} +	} +} + +static void nohz_full_kick_work_func(struct irq_work *work) +{ +	tick_nohz_full_check(); +} + +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { +	.func = nohz_full_kick_work_func, +}; + +/* + * Kick the current CPU if it's full dynticks in order to force it to + * re-evaluate its dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick(void) +{ +	if (tick_nohz_full_cpu(smp_processor_id())) +		irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +} + +static void nohz_full_kick_ipi(void *info) +{ +	tick_nohz_full_check(); +} + +/* + * Kick all full dynticks CPUs in order to force these to re-evaluate + * their dependency on the tick and restart it if necessary. + */ +void tick_nohz_full_kick_all(void) +{ +	if (!have_nohz_full_mask) +		return; + +	preempt_disable(); +	smp_call_function_many(nohz_full_mask, +			       nohz_full_kick_ipi, NULL, false); +	preempt_enable(); +} + +/* + * Re-evaluate the need for the tick as we switch the current task. + * It might need the tick due to per task/process properties: + * perf events, posix cpu timers, ... 
+ */ +void tick_nohz_task_switch(struct task_struct *tsk) +{ +	unsigned long flags; + +	local_irq_save(flags); + +	if (!tick_nohz_full_cpu(smp_processor_id())) +		goto out; + +	if (tick_nohz_tick_stopped() && !can_stop_full_tick()) +		tick_nohz_full_kick(); + +out: +	local_irq_restore(flags); +} + +int tick_nohz_full_cpu(int cpu) +{ +	if (!have_nohz_full_mask) +		return 0; + +	return cpumask_test_cpu(cpu, nohz_full_mask); +} + +/* Parse the boot-time nohz CPU list from the kernel parameters. */ +static int __init tick_nohz_full_setup(char *str) +{ +	int cpu; + +	alloc_bootmem_cpumask_var(&nohz_full_mask); +	if (cpulist_parse(str, nohz_full_mask) < 0) { +		pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); +		return 1; +	} + +	cpu = smp_processor_id(); +	if (cpumask_test_cpu(cpu, nohz_full_mask)) { +		pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); +		cpumask_clear_cpu(cpu, nohz_full_mask); +	} +	have_nohz_full_mask = true; + +	return 1; +} +__setup("nohz_full=", tick_nohz_full_setup); + +static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, +						 unsigned long action, +						 void *hcpu) +{ +	unsigned int cpu = (unsigned long)hcpu; + +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_DOWN_PREPARE: +		/* +		 * If we handle the timekeeping duty for full dynticks CPUs, +		 * we can't safely shutdown that CPU. +		 */ +		if (have_nohz_full_mask && tick_do_timer_cpu == cpu) +			return -EINVAL; +		break; +	} +	return NOTIFY_OK; +} + +/* + * Worst case string length in chunks of CPU range seems 2 steps + * separations: 0,2,4,6,... + * This is NR_CPUS + sizeof('\0') + */ +static char __initdata nohz_full_buf[NR_CPUS + 1]; + +static int tick_nohz_init_all(void) +{ +	int err = -1; + +#ifdef CONFIG_NO_HZ_FULL_ALL +	if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { +		pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); +		return err; +	} +	err = 0; +	cpumask_setall(nohz_full_mask); +	cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); +	have_nohz_full_mask = true; +#endif +	return err; +} + +void __init tick_nohz_init(void) +{ +	int cpu; + +	if (!have_nohz_full_mask) { +		if (tick_nohz_init_all() < 0) +			return; +	} + +	cpu_notifier(tick_nohz_cpu_down_callback, 0); + +	/* Make sure full dynticks CPU are also RCU nocbs */ +	for_each_cpu(cpu, nohz_full_mask) { +		if (!rcu_is_nocb_cpu(cpu)) { +			pr_warning("NO_HZ: CPU %d is not RCU nocb: " +				   "cleared from nohz_full range", cpu); +			cpumask_clear_cpu(cpu, nohz_full_mask); +		} +	} + +	cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); +	pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); +} +#else +#define have_nohz_full_mask (0) +#endif +  /*   * NOHZ - aka dynamic tick functionality   */ -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  /*   * NO HZ enabled ?   
*/ @@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,  			delta_jiffies = rcu_delta_jiffies;  		}  	} +  	/* -	 * Do not stop the tick, if we are only one off -	 * or if the cpu is required for rcu +	 * Do not stop the tick, if we are only one off (or less) +	 * or if the cpu is required for RCU:  	 */ -	if (!ts->tick_stopped && delta_jiffies == 1) +	if (!ts->tick_stopped && delta_jiffies <= 1)  		goto out;  	/* Schedule the tick, if we are at least one jiffie off */ @@ -378,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,  			time_delta = KTIME_MAX;  		} +#ifdef CONFIG_NO_HZ_FULL +		if (!ts->inidle) { +			time_delta = min(time_delta, +					 scheduler_tick_max_deferment()); +		} +#endif +  		/*  		 * calculate the expiry time for the next timer wheel  		 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals @@ -421,6 +650,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,  			ts->last_tick = hrtimer_get_expires(&ts->sched_timer);  			ts->tick_stopped = 1; +			trace_tick_stop(1, " ");  		}  		/* @@ -457,6 +687,24 @@ out:  	return ret;  } +static void tick_nohz_full_stop_tick(struct tick_sched *ts) +{ +#ifdef CONFIG_NO_HZ_FULL +       int cpu = smp_processor_id(); + +       if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) +               return; + +       if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) +	       return; + +       if (!can_stop_full_tick()) +               return; + +       tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); +#endif +} +  static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)  {  	/* @@ -482,13 +730,28 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)  		if (ratelimit < 10 &&  		    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { -			printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", -			       (unsigned int) local_softirq_pending()); +			pr_warn("NOHZ: local_softirq_pending %02x\n", +				(unsigned int) local_softirq_pending());  			ratelimit++;  		}  		return false;  	} +	if (have_nohz_full_mask) { +		/* +		 * Keep the tick alive to guarantee timekeeping progression +		 * if there are full dynticks CPUs around +		 */ +		if (tick_do_timer_cpu == cpu) +			return false; +		/* +		 * Boot safety: make sure the timekeeping duty has been +		 * assigned before entering dyntick-idle mode, +		 */ +		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) +			return false; +	} +  	return true;  } @@ -568,12 +831,13 @@ void tick_nohz_irq_exit(void)  {  	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); -	if (!ts->inidle) -		return; - -	/* Cancel the timer because CPU already waken up from the C-states*/ -	menu_hrtimer_cancel(); -	__tick_nohz_idle_enter(ts); +	if (ts->inidle) { +		/* Cancel the timer because CPU already waken up from the C-states*/ +		menu_hrtimer_cancel(); +		__tick_nohz_idle_enter(ts); +	} else { +		tick_nohz_full_stop_tick(ts); +	}  }  /** @@ -802,7 +1066,7 @@ static inline void tick_check_nohz(int cpu)  static inline void tick_nohz_switch_to_nohz(void) { }  static inline void tick_check_nohz(int cpu) { } -#endif /* NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */  /*   * Called from irq_enter to notify about the possible interruption of idle() @@ -887,14 +1151,14 @@ void tick_setup_sched_timer(void)  		now = ktime_get();  	} -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON  	if (tick_nohz_enabled)  		ts->nohz_mode = NOHZ_MODE_HIGHRES;  #endif  }  #endif /* HIGH_RES_TIMERS */ -#if defined CONFIG_NO_HZ || defined 
CONFIG_HIGH_RES_TIMERS +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS  void tick_cancel_sched_timer(int cpu)  {  	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9a0bc98fbe1..98cd470bbe4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -23,8 +23,13 @@  #include <linux/stop_machine.h>  #include <linux/pvclock_gtod.h> +#include "tick-internal.h" +#include "ntp_internal.h"  static struct timekeeper timekeeper; +static DEFINE_RAW_SPINLOCK(timekeeper_lock); +static seqcount_t timekeeper_seq; +static struct timekeeper shadow_timekeeper;  /* flag for if timekeeping is suspended */  int __read_mostly timekeeping_suspended; @@ -67,6 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)  	tk->wall_to_monotonic = wtm;  	set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);  	tk->offs_real = timespec_to_ktime(tmp); +	tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0));  }  static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) @@ -96,7 +102,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)  	old_clock = tk->clock;  	tk->clock = clock; -	clock->cycle_last = clock->read(clock); +	tk->cycle_last = clock->cycle_last = clock->read(clock);  	/* Do the ns -> cycle conversion first, using original mult */  	tmp = NTP_INTERVAL_LENGTH; @@ -201,8 +207,6 @@ static void update_pvclock_gtod(struct timekeeper *tk)  /**   * pvclock_gtod_register_notifier - register a pvclock timedata update listener - * - * Must hold write on timekeeper.lock   */  int pvclock_gtod_register_notifier(struct notifier_block *nb)  { @@ -210,11 +214,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb)  	unsigned long flags;  	int ret; -	write_seqlock_irqsave(&tk->lock, flags); +	raw_spin_lock_irqsave(&timekeeper_lock, flags);  	ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); -	/* update timekeeping data */  	update_pvclock_gtod(tk); -	write_sequnlock_irqrestore(&tk->lock, flags); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	return ret;  } @@ -223,25 +226,22 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);  /**   * pvclock_gtod_unregister_notifier - unregister a pvclock   * timedata update listener - * - * Must hold write on timekeeper.lock   */  int pvclock_gtod_unregister_notifier(struct notifier_block *nb)  { -	struct timekeeper *tk = &timekeeper;  	unsigned long flags;  	int ret; -	write_seqlock_irqsave(&tk->lock, flags); +	raw_spin_lock_irqsave(&timekeeper_lock, flags);  	ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); -	write_sequnlock_irqrestore(&tk->lock, flags); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	return ret;  }  EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); -/* must hold write on timekeeper.lock */ -static void timekeeping_update(struct timekeeper *tk, bool clearntp) +/* must hold timekeeper_lock */ +static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror)  {  	if (clearntp) {  		tk->ntp_error = 0; @@ -249,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)  	}  	update_vsyscall(tk);  	update_pvclock_gtod(tk); + +	if (mirror) +		memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));  }  /** @@ -267,7 +270,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)  	clock = tk->clock;  	cycle_now = clock->read(clock);  	cycle_delta = (cycle_now - 
clock->cycle_last) & clock->mask; -	clock->cycle_last = cycle_now; +	tk->cycle_last = clock->cycle_last = cycle_now;  	tk->xtime_nsec += cycle_delta * tk->mult; @@ -294,12 +297,12 @@ int __getnstimeofday(struct timespec *ts)  	s64 nsecs = 0;  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		ts->tv_sec = tk->xtime_sec;  		nsecs = timekeeping_get_ns(tk); -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	ts->tv_nsec = 0;  	timespec_add_ns(ts, nsecs); @@ -335,11 +338,11 @@ ktime_t ktime_get(void)  	WARN_ON(timekeeping_suspended);  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;  		nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	/*  	 * Use ktime_set/ktime_add_ns to create a proper ktime on  	 * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -366,12 +369,12 @@ void ktime_get_ts(struct timespec *ts)  	WARN_ON(timekeeping_suspended);  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		ts->tv_sec = tk->xtime_sec;  		nsec = timekeeping_get_ns(tk);  		tomono = tk->wall_to_monotonic; -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	ts->tv_sec += tomono.tv_sec;  	ts->tv_nsec = 0; @@ -379,6 +382,50 @@ void ktime_get_ts(struct timespec *ts)  }  EXPORT_SYMBOL_GPL(ktime_get_ts); + +/** + * timekeeping_clocktai - Returns the TAI time of day in a timespec + * @ts:		pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void timekeeping_clocktai(struct timespec *ts) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long seq; +	u64 nsecs; + +	WARN_ON(timekeeping_suspended); + +	do { +		seq = read_seqcount_begin(&timekeeper_seq); + +		ts->tv_sec = tk->xtime_sec + tk->tai_offset; +		nsecs = timekeeping_get_ns(tk); + +	} while (read_seqcount_retry(&timekeeper_seq, seq)); + +	ts->tv_nsec = 0; +	timespec_add_ns(ts, nsecs); + +} +EXPORT_SYMBOL(timekeeping_clocktai); + + +/** + * ktime_get_clocktai - Returns the TAI time of day in a ktime + * + * Returns the time of day in a ktime. 
+ */ +ktime_t ktime_get_clocktai(void) +{ +	struct timespec ts; + +	timekeeping_clocktai(&ts); +	return timespec_to_ktime(ts); +} +EXPORT_SYMBOL(ktime_get_clocktai); +  #ifdef CONFIG_NTP_PPS  /** @@ -399,7 +446,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)  	WARN_ON_ONCE(timekeeping_suspended);  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		*ts_raw = tk->raw_time;  		ts_real->tv_sec = tk->xtime_sec; @@ -408,7 +455,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)  		nsecs_raw = timekeeping_get_ns_raw(tk);  		nsecs_real = timekeeping_get_ns(tk); -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	timespec_add_ns(ts_raw, nsecs_raw);  	timespec_add_ns(ts_real, nsecs_real); @@ -448,7 +495,8 @@ int do_settimeofday(const struct timespec *tv)  	if (!timespec_valid_strict(tv))  		return -EINVAL; -	write_seqlock_irqsave(&tk->lock, flags); +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq);  	timekeeping_forward_now(tk); @@ -460,9 +508,10 @@ int do_settimeofday(const struct timespec *tv)  	tk_set_xtime(tk, tv); -	timekeeping_update(tk, true); +	timekeeping_update(tk, true, true); -	write_sequnlock_irqrestore(&tk->lock, flags); +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	/* signal hrtimers about time change */  	clock_was_set(); @@ -487,7 +536,8 @@ int timekeeping_inject_offset(struct timespec *ts)  	if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)  		return -EINVAL; -	write_seqlock_irqsave(&tk->lock, flags); +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq);  	timekeeping_forward_now(tk); @@ -502,9 +552,10 @@ int timekeeping_inject_offset(struct timespec *ts)  	tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));  error: /* even if we error out, we forwarded the time, so call update */ -	timekeeping_update(tk, true); +	timekeeping_update(tk, true, true); -	write_sequnlock_irqrestore(&tk->lock, flags); +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	/* signal hrtimers about time change */  	clock_was_set(); @@ -513,6 +564,52 @@ error: /* even if we error out, we forwarded the time, so call update */  }  EXPORT_SYMBOL(timekeeping_inject_offset); + +/** + * timekeeping_get_tai_offset - Returns current TAI offset from UTC + * + */ +s32 timekeeping_get_tai_offset(void) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned int seq; +	s32 ret; + +	do { +		seq = read_seqcount_begin(&timekeeper_seq); +		ret = tk->tai_offset; +	} while (read_seqcount_retry(&timekeeper_seq, seq)); + +	return ret; +} + +/** + * __timekeeping_set_tai_offset - Lock free worker function + * + */ +static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +{ +	tk->tai_offset = tai_offset; +	tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); +} + +/** + * timekeeping_set_tai_offset - Sets the current TAI offset from UTC + * + */ +void timekeeping_set_tai_offset(s32 tai_offset) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long flags; + +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); +	__timekeeping_set_tai_offset(tk, tai_offset); +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +	clock_was_set(); +} +  /**   * change_clocksource - Swaps clocksources if a 
new one is available   * @@ -526,7 +623,8 @@ static int change_clocksource(void *data)  	new = (struct clocksource *) data; -	write_seqlock_irqsave(&tk->lock, flags); +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq);  	timekeeping_forward_now(tk);  	if (!new->enable || new->enable(new) == 0) { @@ -535,9 +633,10 @@ static int change_clocksource(void *data)  		if (old->disable)  			old->disable(old);  	} -	timekeeping_update(tk, true); +	timekeeping_update(tk, true, true); -	write_sequnlock_irqrestore(&tk->lock, flags); +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	return 0;  } @@ -587,11 +686,11 @@ void getrawmonotonic(struct timespec *ts)  	s64 nsecs;  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		nsecs = timekeeping_get_ns_raw(tk);  		*ts = tk->raw_time; -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	timespec_add_ns(ts, nsecs);  } @@ -607,11 +706,11 @@ int timekeeping_valid_for_hres(void)  	int ret;  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	return ret;  } @@ -626,11 +725,11 @@ u64 timekeeping_max_deferment(void)  	u64 ret;  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		ret = tk->clock->max_idle_ns; -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	return ret;  } @@ -693,11 +792,10 @@ void __init timekeeping_init(void)  		boot.tv_nsec = 0;  	} -	seqlock_init(&tk->lock); - +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq);  	ntp_init(); -	write_seqlock_irqsave(&tk->lock, flags);  	clock = clocksource_default_clock();  	if (clock->enable)  		clock->enable(clock); @@ -716,7 +814,10 @@ void __init timekeeping_init(void)  	tmp.tv_nsec = 0;  	tk_set_sleep_time(tk, tmp); -	write_sequnlock_irqrestore(&tk->lock, flags); +	memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); + +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  }  /* time in seconds when suspend began */ @@ -764,15 +865,17 @@ void timekeeping_inject_sleeptime(struct timespec *delta)  	if (has_persistent_clock())  		return; -	write_seqlock_irqsave(&tk->lock, flags); +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq);  	timekeeping_forward_now(tk);  	__timekeeping_inject_sleeptime(tk, delta); -	timekeeping_update(tk, true); +	timekeeping_update(tk, true, true); -	write_sequnlock_irqrestore(&tk->lock, flags); +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	/* signal hrtimers about time change */  	clock_was_set(); @@ -788,26 +891,72 @@ void timekeeping_inject_sleeptime(struct timespec *delta)  static void timekeeping_resume(void)  {  	struct timekeeper *tk = &timekeeper; +	struct clocksource *clock = tk->clock;  	unsigned long flags; -	struct timespec ts; +	struct timespec ts_new, ts_delta; +	cycle_t cycle_now, cycle_delta; +	bool suspendtime_found = false; -	read_persistent_clock(&ts); +	read_persistent_clock(&ts_new);  	clockevents_resume();  	clocksource_resume(); -	write_seqlock_irqsave(&tk->lock, flags); +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	
write_seqcount_begin(&timekeeper_seq); + +	/* +	 * After system resumes, we need to calculate the suspended time and +	 * compensate it for the OS time. There are 3 sources that could be +	 * used: Nonstop clocksource during suspend, persistent clock and rtc +	 * device. +	 * +	 * One specific platform may have 1 or 2 or all of them, and the +	 * preference will be: +	 *	suspend-nonstop clocksource -> persistent clock -> rtc +	 * The less preferred source will only be tried if there is no better +	 * usable source. The rtc part is handled separately in rtc core code. +	 */ +	cycle_now = clock->read(clock); +	if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && +		cycle_now > clock->cycle_last) { +		u64 num, max = ULLONG_MAX; +		u32 mult = clock->mult; +		u32 shift = clock->shift; +		s64 nsec = 0; + +		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; -	if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { -		ts = timespec_sub(ts, timekeeping_suspend_time); -		__timekeeping_inject_sleeptime(tk, &ts); +		/* +		 * "cycle_delta * mutl" may cause 64 bits overflow, if the +		 * suspended time is too long. In that case we need do the +		 * 64 bits math carefully +		 */ +		do_div(max, mult); +		if (cycle_delta > max) { +			num = div64_u64(cycle_delta, max); +			nsec = (((u64) max * mult) >> shift) * num; +			cycle_delta -= num * max; +		} +		nsec += ((u64) cycle_delta * mult) >> shift; + +		ts_delta = ns_to_timespec(nsec); +		suspendtime_found = true; +	} else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { +		ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); +		suspendtime_found = true;  	} -	/* re-base the last cycle value */ -	tk->clock->cycle_last = tk->clock->read(tk->clock); + +	if (suspendtime_found) +		__timekeeping_inject_sleeptime(tk, &ts_delta); + +	/* Re-base the last cycle value */ +	tk->cycle_last = clock->cycle_last = cycle_now;  	tk->ntp_error = 0;  	timekeeping_suspended = 0; -	timekeeping_update(tk, false); -	write_sequnlock_irqrestore(&tk->lock, flags); +	timekeeping_update(tk, false, true); +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	touch_softlockup_watchdog(); @@ -826,7 +975,8 @@ static int timekeeping_suspend(void)  	read_persistent_clock(&timekeeping_suspend_time); -	write_seqlock_irqsave(&tk->lock, flags); +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq);  	timekeeping_forward_now(tk);  	timekeeping_suspended = 1; @@ -849,7 +999,8 @@ static int timekeeping_suspend(void)  		timekeeping_suspend_time =  			timespec_add(timekeeping_suspend_time, delta_delta);  	} -	write_sequnlock_irqrestore(&tk->lock, flags); +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);  	clocksource_suspend(); @@ -1099,6 +1250,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)  			tk_set_wall_to_mono(tk,  				timespec_sub(tk->wall_to_monotonic, ts)); +			__timekeeping_set_tai_offset(tk, tk->tai_offset - leap); +  			clock_was_set_delayed();  		}  	} @@ -1116,15 +1269,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)  static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,  						u32 shift)  { +	cycle_t interval = tk->cycle_interval << shift;  	u64 raw_nsecs;  	/* If the offset is smaller then a shifted interval, do nothing */ -	if (offset < tk->cycle_interval<<shift) +	if (offset < interval)  		return offset; 
 	/* Accumulate one shifted interval */ -	offset -= tk->cycle_interval << shift; -	tk->clock->cycle_last += tk->cycle_interval << shift; +	offset -= interval; +	tk->cycle_last += interval;  	tk->xtime_nsec += tk->xtime_interval << shift;  	accumulate_nsecs_to_secs(tk); @@ -1181,27 +1335,28 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)  static void update_wall_time(void)  {  	struct clocksource *clock; -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *real_tk = &timekeeper; +	struct timekeeper *tk = &shadow_timekeeper;  	cycle_t offset;  	int shift = 0, maxshift;  	unsigned long flags; -	write_seqlock_irqsave(&tk->lock, flags); +	raw_spin_lock_irqsave(&timekeeper_lock, flags);  	/* Make sure we're fully resumed: */  	if (unlikely(timekeeping_suspended))  		goto out; -	clock = tk->clock; +	clock = real_tk->clock;  #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -	offset = tk->cycle_interval; +	offset = real_tk->cycle_interval;  #else  	offset = (clock->read(clock) - clock->cycle_last) & clock->mask;  #endif  	/* Check if there's really nothing to do */ -	if (offset < tk->cycle_interval) +	if (offset < real_tk->cycle_interval)  		goto out;  	/* @@ -1238,11 +1393,24 @@ static void update_wall_time(void)  	 */  	accumulate_nsecs_to_secs(tk); -	timekeeping_update(tk, false); - +	write_seqcount_begin(&timekeeper_seq); +	/* Update clock->cycle_last with the new value */ +	clock->cycle_last = tk->cycle_last; +	/* +	 * Update the real timekeeper. +	 * +	 * We could avoid this memcpy by switching pointers, but that +	 * requires changes to all other timekeeper usage sites as +	 * well, i.e. move the timekeeper pointer getter into the +	 * spinlocked/seqcount protected sections. And we trade this +	 * memcpy under the timekeeper_seq against one before we start +	 * updating. 
+	 */ +	memcpy(real_tk, tk, sizeof(*tk)); +	timekeeping_update(real_tk, false, false); +	write_seqcount_end(&timekeeper_seq);  out: -	write_sequnlock_irqrestore(&tk->lock, flags); - +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  }  /** @@ -1289,13 +1457,13 @@ void get_monotonic_boottime(struct timespec *ts)  	WARN_ON(timekeeping_suspended);  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		ts->tv_sec = tk->xtime_sec;  		nsec = timekeeping_get_ns(tk);  		tomono = tk->wall_to_monotonic;  		sleep = tk->total_sleep_time; -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	ts->tv_sec += tomono.tv_sec + sleep.tv_sec;  	ts->tv_nsec = 0; @@ -1354,10 +1522,10 @@ struct timespec current_kernel_time(void)  	unsigned long seq;  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		now = tk_xtime(tk); -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	return now;  } @@ -1370,11 +1538,11 @@ struct timespec get_monotonic_coarse(void)  	unsigned long seq;  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		now = tk_xtime(tk);  		mono = tk->wall_to_monotonic; -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,  				now.tv_nsec + mono.tv_nsec); @@ -1405,11 +1573,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,  	unsigned long seq;  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		*xtim = tk_xtime(tk);  		*wtom = tk->wall_to_monotonic;  		*sleep = tk->total_sleep_time; -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  }  #ifdef CONFIG_HIGH_RES_TIMERS @@ -1421,7 +1589,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,   * Returns current monotonic time and updates the offsets   * Called from hrtimer_interupt() or retrigger_next_event()   */ -ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, +							ktime_t *offs_tai)  {  	struct timekeeper *tk = &timekeeper;  	ktime_t now; @@ -1429,14 +1598,15 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)  	u64 secs, nsecs;  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		secs = tk->xtime_sec;  		nsecs = timekeeping_get_ns(tk);  		*offs_real = tk->offs_real;  		*offs_boot = tk->offs_boot; -	} while (read_seqretry(&tk->lock, seq)); +		*offs_tai = tk->offs_tai; +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	now = ktime_add_ns(ktime_set(secs, 0), nsecs);  	now = ktime_sub(now, *offs_real); @@ -1454,15 +1624,79 @@ ktime_t ktime_get_monotonic_offset(void)  	struct timespec wtom;  	do { -		seq = read_seqbegin(&tk->lock); +		seq = read_seqcount_begin(&timekeeper_seq);  		wtom = tk->wall_to_monotonic; -	} while (read_seqretry(&tk->lock, seq)); +	} while (read_seqcount_retry(&timekeeper_seq, seq));  	return timespec_to_ktime(wtom);  }  EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);  /** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + */ +int do_adjtimex(struct timex *txc) +{ +	struct timekeeper *tk = &timekeeper; +	unsigned long flags; +	struct timespec ts; +	s32 orig_tai, tai; +	int ret; + +	/* Validate the data before 
disabling interrupts */ +	ret = ntp_validate_timex(txc); +	if (ret) +		return ret; + +	if (txc->modes & ADJ_SETOFFSET) { +		struct timespec delta; +		delta.tv_sec  = txc->time.tv_sec; +		delta.tv_nsec = txc->time.tv_usec; +		if (!(txc->modes & ADJ_NANO)) +			delta.tv_nsec *= 1000; +		ret = timekeeping_inject_offset(&delta); +		if (ret) +			return ret; +	} + +	getnstimeofday(&ts); + +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); + +	orig_tai = tai = tk->tai_offset; +	ret = __do_adjtimex(txc, &ts, &tai); + +	if (tai != orig_tai) { +		__timekeeping_set_tai_offset(tk, tai); +		clock_was_set_delayed(); +	} +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + +	return ret; +} + +#ifdef CONFIG_NTP_PPS +/** + * hardpps() - Accessor function to NTP __hardpps function + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ +	unsigned long flags; + +	raw_spin_lock_irqsave(&timekeeper_lock, flags); +	write_seqcount_begin(&timekeeper_seq); + +	__hardpps(phase_ts, raw_ts); + +	write_seqcount_end(&timekeeper_seq); +	raw_spin_unlock_irqrestore(&timekeeper_lock, flags); +} +EXPORT_SYMBOL(hardpps); +#endif + +/**   * xtime_update() - advances the timekeeping infrastructure   * @ticks:	number of ticks, that have elapsed since the last call.   * diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index af5a7e9f164..3bdf2832301 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -20,6 +20,13 @@  #include <asm/uaccess.h> + +struct timer_list_iter { +	int cpu; +	bool second_pass; +	u64 now; +}; +  typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);  DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); @@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)  	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);  	int i; -	SEQ_printf(m, "\n");  	SEQ_printf(m, "cpu: %d\n", cpu);  	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {  		SEQ_printf(m, " clock %d:\n", i); @@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)  #undef P  #undef P_ns +	SEQ_printf(m, "\n");  }  #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)  {  	struct clock_event_device *dev = td->evtdev; -	SEQ_printf(m, "\n");  	SEQ_printf(m, "Tick Device: mode:     %d\n", td->mode);  	if (cpu < 0)  		SEQ_printf(m, "Broadcast device\n"); @@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)  	print_name_offset(m, dev->event_handler);  	SEQ_printf(m, "\n");  	SEQ_printf(m, " retries:        %lu\n", dev->retries); +	SEQ_printf(m, "\n");  } -static void timer_list_show_tickdevices(struct seq_file *m) +static void timer_list_show_tickdevices_header(struct seq_file *m)  { -	int cpu; -  #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST  	print_tickdevice(m, tick_get_broadcast_device(), -1);  	SEQ_printf(m, "tick_broadcast_mask: %08lx\n", @@ -246,47 +251,104 @@ static void timer_list_show_tickdevices(struct seq_file *m)  #endif  	SEQ_printf(m, "\n");  #endif -	for_each_online_cpu(cpu) -		print_tickdevice(m, tick_get_device(cpu), cpu); -	SEQ_printf(m, "\n");  } -#else -static void timer_list_show_tickdevices(struct seq_file *m) { }  #endif +static inline void timer_list_header(struct seq_file *m, u64 now) +{ +	SEQ_printf(m, "Timer List Version: v0.7\n"); +	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 
+	SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); +	SEQ_printf(m, "\n"); +} +  static int timer_list_show(struct seq_file *m, void *v)  { +	struct timer_list_iter *iter = v; +	u64 now = ktime_to_ns(ktime_get()); + +	if (iter->cpu == -1 && !iter->second_pass) +		timer_list_header(m, now); +	else if (!iter->second_pass) +		print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS +	else if (iter->cpu == -1 && iter->second_pass) +		timer_list_show_tickdevices_header(m); +	else +		print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif +	return 0; +} + +void sysrq_timer_list_show(void) +{  	u64 now = ktime_to_ns(ktime_get());  	int cpu; -	SEQ_printf(m, "Timer List Version: v0.7\n"); -	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); -	SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); +	timer_list_header(NULL, now);  	for_each_online_cpu(cpu) -		print_cpu(m, cpu, now); +		print_cpu(NULL, cpu, now); -	SEQ_printf(m, "\n"); -	timer_list_show_tickdevices(m); +#ifdef CONFIG_GENERIC_CLOCKEVENTS +	timer_list_show_tickdevices_header(NULL); +	for_each_online_cpu(cpu) +		print_tickdevice(NULL, tick_get_device(cpu), cpu); +#endif +	return; +} -	return 0; +static void *timer_list_start(struct seq_file *file, loff_t *offset) +{ +	struct timer_list_iter *iter = file->private; + +	if (!*offset) { +		iter->cpu = -1; +		iter->now = ktime_to_ns(ktime_get()); +	} else if (iter->cpu >= nr_cpu_ids) { +#ifdef CONFIG_GENERIC_CLOCKEVENTS +		if (!iter->second_pass) { +			iter->cpu = -1; +			iter->second_pass = true; +		} else +			return NULL; +#else +		return NULL; +#endif +	} +	return iter;  } -void sysrq_timer_list_show(void) +static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset) +{ +	struct timer_list_iter *iter = file->private; +	iter->cpu = cpumask_next(iter->cpu, cpu_online_mask); +	++*offset; +	return timer_list_start(file, offset); +} + +static void timer_list_stop(struct seq_file *seq, void *v)  { -	timer_list_show(NULL, NULL);  } +static const struct seq_operations timer_list_sops = { +	.start = timer_list_start, +	.next = timer_list_next, +	.stop = timer_list_stop, +	.show = timer_list_show, +}; +  static int timer_list_open(struct inode *inode, struct file *filp)  { -	return single_open(filp, timer_list_show, NULL); +	return seq_open_private(filp, &timer_list_sops, +			sizeof(struct timer_list_iter));  }  static const struct file_operations timer_list_fops = {  	.open		= timer_list_open,  	.read		= seq_read,  	.llseek		= seq_lseek, -	.release	= single_release, +	.release	= seq_release_private,  };  static int __init init_timer_list_procfs(void) | 
